dots/chezmoi/dot_config/tmux/plugins/extrakto/executable_extrakto.py
aleidk 224c7ed45c Migrate to chezmoi
Move config files from config to chezmoi
Add script to auto install packages with DNF and Cargo
2024-03-01 20:26:02 -03:00

215 lines
6.2 KiB
Python

#!/usr/bin/env python3
import os
import re
import sys
from argparse import ArgumentParser
from collections import OrderedDict
from configparser import ConfigParser
SCRIPT_DIR = os.path.dirname(os.path.realpath(__file__))
# "words" consist of anything but the following characters:
# [](){}=$
# unicode range 2500-27BF which includes:
# - Box Drawing
# - Block Elements
# - Geometric Shapes
# - Miscellaneous Symbols
# - Dingbats
# unicode range E000-F8FF (private use/Powerline)
# and whitespace ( \t\n\r)
RE_WORD = "[^][(){}=$\u2500-\u27BF\uE000-\uF8FF \\t\\n\\r]+"
class Extrakto:
def __init__(self, *, min_length=5, alt=False, prefix_name=False):
conf = ConfigParser(interpolation=None)
default_conf = os.path.join(SCRIPT_DIR, "extrakto.conf")
user_conf = os.path.join(
os.path.expanduser("~/.config"), "extrakto/extrakto.conf"
)
conf.read([default_conf, user_conf])
sections = conf.sections()
if not "path" in sections or not "url" in sections:
raise Exception("extrakto.conf incomplete, path and url must exist")
self.min_length = min_length
self.alt = alt
self.prefix_name = prefix_name
self.in_all = []
self.fdict = {}
for name in sections:
sect = conf[name]
alt = []
for i in range(2, 10):
key = f"alt{i}"
# if alt2, alt{n} exists as a value in a section, create a variant based on that regex
if key in sect:
alt.append(sect[key])
if sect.getboolean("in_all", fallback=True):
self.in_all.append(name)
if sect.getboolean("enabled", fallback=True):
self.fdict[name] = FilterDef(
self,
name,
regex=sect.get("regex"),
exclude=sect.get("exclude", ""),
lstrip=sect.get("lstrip", ""),
rstrip=sect.get("rstrip", ""),
alt=alt,
)
def __getitem__(self, key):
if not key in self.fdict:
raise Exception(f"Unknown filter {key}")
return self.fdict[key]
def all(self):
return self.in_all
def keys(self):
return list(self.fdict.keys())
class FilterDef:
def __init__(self, extrakto, name, *, regex, exclude, lstrip, rstrip, alt):
self.extrakto = extrakto
self.name = name
self.regex = regex
self.exclude = exclude
self.lstrip = lstrip
self.rstrip = rstrip
self.alt = alt
def filter(self, text):
res = list()
if self.extrakto.prefix_name:
add = lambda name, value: res.append(f"{name}: {value}")
else:
add = lambda name, value: res.append(value)
for m in re.finditer(self.regex, "\n" + text, flags=re.I):
item = "".join(filter(None, m.groups()))
# strip invalid characters (like punctuation or markdown syntax)
if self.lstrip:
item = item.lstrip(self.lstrip)
if self.rstrip:
item = item.rstrip(self.rstrip)
if len(item) >= self.extrakto.min_length:
if not self.exclude or not re.search(self.exclude, item, re.I):
if self.extrakto.alt:
for i, altre in enumerate(self.alt):
m = re.search(altre, item)
if m:
add(f"{self.name}{i+2}", m[1])
add(self.name, item)
return res
def get_lines(text, *, min_length=5, prefix_name=False):
lines = []
for raw_line in text.splitlines():
line = raw_line.strip()
if len(line) >= min_length:
if prefix_name:
lines.append("line: " + line)
else:
lines.append(line)
return lines
def main(parser):
args = parser.parse_args()
run_list = []
if args.words:
run_list.append("word")
if args.paths:
run_list.append("path")
if args.urls:
run_list.append("url")
run_list += args.add
res = []
# input from the terminal can cause UnicodeDecodeErrors in some instances, ignore for now
text = sys.stdin.buffer.read().decode("utf-8", "ignore")
extrakto = Extrakto(min_length=args.min_length, alt=args.alt, prefix_name=args.name)
if args.all:
run_list = extrakto.all()
if args.lines:
res += get_lines(text, min_length=args.min_length, prefix_name=args.name)
for name in run_list:
res += extrakto[name].filter(text)
if res:
if args.reverse:
res.reverse()
# remove duplicates and print
for item in OrderedDict.fromkeys(res):
print(item)
elif args.warn_empty:
print("NO MATCH - use a different filter")
if __name__ == "__main__":
parser = ArgumentParser(description="Extracts tokens from plaintext.")
parser.add_argument(
"--name", action="store_true", help="prefix filter name in the output"
)
parser.add_argument(
"-w", "--words", action="store_true", help='extract "word" tokens'
)
parser.add_argument("-l", "--lines", action="store_true", help="extract lines")
parser.add_argument(
"--all",
action="store_true",
help="extract using all filters defined in extrakto.conf",
)
parser.add_argument(
"-a", "--add", action="append", default=[], help="add custom filter"
)
parser.add_argument("-p", "--paths", action="store_true", help="short for -a=path")
parser.add_argument("-u", "--urls", action="store_true", help="short for -a=url")
parser.add_argument(
"--alt",
action="store_true",
help="return alternate variants for each match (e.g. https://example.com and example.com)",
)
parser.add_argument("-r", "--reverse", action="store_true", help="reverse output")
parser.add_argument(
"-m", "--min-length", default=5, help="minimum token length", type=int
)
parser.add_argument(
"--warn-empty", action="store_true", help="warn if result is empty"
)
main(parser)