#!/usr/bin/env python3
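"""Extract tokens (words, paths, URLs, ...) from plain text read on stdin.

Filter definitions come from extrakto.conf: the copy shipped next to this
script, optionally overridden by ~/.config/extrakto/extrakto.conf.  Matches
are printed one per line with duplicates removed.

Example invocation (assuming the default url filter in extrakto.conf):

    echo "docs at https://example.com/readme" | ./extrakto.py --urls
"""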

import os
import re
import sys

from argparse import ArgumentParser
from collections import OrderedDict
from configparser import ConfigParser

SCRIPT_DIR = os.path.dirname(os.path.realpath(__file__))

# "words" consist of anything but the following characters:
# [](){}=$
# unicode range 2500-27BF which includes:
# - Box Drawing
# - Block Elements
# - Geometric Shapes
# - Miscellaneous Symbols
# - Dingbats
# unicode range E000-F8FF (private use/Powerline)
# and whitespace ( \t\n\r)
RE_WORD = "[^][(){}=$\u2500-\u27BF\uE000-\uF8FF \\t\\n\\r]+"
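# Example: re.findall(RE_WORD, "ls -la ./src │ wc -l") yields
# ["ls", "-la", "./src", "wc", "-l"]; the box-drawing bar (U+2502) and the
# spaces act as separators.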


class Extrakto:
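    """Collection of token filters defined in extrakto.conf.

    The bundled extrakto.conf is read first and may be overridden by
    ~/.config/extrakto/extrakto.conf; every enabled section becomes a
    FilterDef accessible by name.
    """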
    def __init__(self, *, min_length=5, alt=False, prefix_name=False):
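        """Read extrakto.conf and build the enabled filters.

        min_length: minimum token length reported by the filters.
        alt: also emit alternate variants (alt2..alt9) of each match.
        prefix_name: prefix every result with its filter name.
        """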
        conf = ConfigParser(interpolation=None)
        default_conf = os.path.join(SCRIPT_DIR, "extrakto.conf")
        user_conf = os.path.join(
            os.path.expanduser("~/.config"), "extrakto/extrakto.conf"
        )

        conf.read([default_conf, user_conf])
        sections = conf.sections()

        if "path" not in sections or "url" not in sections:
            raise Exception("extrakto.conf incomplete, path and url must exist")

        self.min_length = min_length
        self.alt = alt
        self.prefix_name = prefix_name

        self.in_all = []
        self.fdict = {}

        for name in sections:
            sect = conf[name]
            alt = []
            for i in range(2, 10):
                key = f"alt{i}"

                # if alt2..alt9 is set in the section, create an alternate variant based on that regex
                if key in sect:
                    alt.append(sect[key])

            if sect.getboolean("in_all", fallback=True):
                self.in_all.append(name)

            if sect.getboolean("enabled", fallback=True):
                self.fdict[name] = FilterDef(
                    self,
                    name,
                    regex=sect.get("regex"),
                    exclude=sect.get("exclude", ""),
                    lstrip=sect.get("lstrip", ""),
                    rstrip=sect.get("rstrip", ""),
                    alt=alt,
                )

    def __getitem__(self, key):
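        """Return the FilterDef registered under key."""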
        if key not in self.fdict:
            raise Exception(f"Unknown filter {key}")
        return self.fdict[key]

    def all(self):
        return self.in_all

    def keys(self):
        return list(self.fdict.keys())


class FilterDef:
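    """A single filter from extrakto.conf.

    Holds the match regex plus optional exclude, lstrip/rstrip and
    alternate-variant (alt2..alt9) rules applied by filter().
    """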
    def __init__(self, extrakto, name, *, regex, exclude, lstrip, rstrip, alt):
        self.extrakto = extrakto
        self.name = name
        self.regex = regex
        self.exclude = exclude
        self.lstrip = lstrip
        self.rstrip = rstrip
        self.alt = alt

    def filter(self, text):
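        """Return all matches of this filter in text.

        lstrip/rstrip characters are trimmed first; matches shorter than
        min_length or matching the exclude regex are dropped.  With alt
        enabled, alternate variants (alt2..alt9) are emitted as well.
        """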
        res = []
        if self.extrakto.prefix_name:
            add = lambda name, value: res.append(f"{name}: {value}")
        else:
            add = lambda name, value: res.append(value)

        for m in re.finditer(self.regex, "\n" + text, flags=re.I):
            item = "".join(filter(None, m.groups()))

            # strip invalid characters (like punctuation or markdown syntax)
            if self.lstrip:
                item = item.lstrip(self.lstrip)
            if self.rstrip:
                item = item.rstrip(self.rstrip)

            if len(item) >= self.extrakto.min_length:
                if not self.exclude or not re.search(self.exclude, item, re.I):
                    if self.extrakto.alt:
                        for i, altre in enumerate(self.alt):
                            alt_match = re.search(altre, item)
                            if alt_match:
                                add(f"{self.name}{i + 2}", alt_match[1])
                    add(self.name, item)
        return res


def get_lines(text, *, min_length=5, prefix_name=False):
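    """Return the stripped lines of text that are at least min_length long.

    With prefix_name, each line is prefixed with "line: ".
    """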
    lines = []

    for raw_line in text.splitlines():
        line = raw_line.strip()
        if len(line) >= min_length:
            if prefix_name:
                lines.append("line: " + line)
            else:
                lines.append(line)

    return lines


def main(parser):
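    """Read text from stdin, run the selected extractors and print unique results."""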
    args = parser.parse_args()

    run_list = []
    if args.words:
        run_list.append("word")
    if args.paths:
        run_list.append("path")
    if args.urls:
        run_list.append("url")
    run_list += args.add

    res = []
    # input from the terminal can cause UnicodeDecodeErrors in some instances, ignore for now
    text = sys.stdin.buffer.read().decode("utf-8", "ignore")

    extrakto = Extrakto(min_length=args.min_length, alt=args.alt, prefix_name=args.name)
    if args.all:
        run_list = extrakto.all()

    if args.lines:
        res += get_lines(text, min_length=args.min_length, prefix_name=args.name)

    for name in run_list:
        res += extrakto[name].filter(text)

    if res:
        if args.reverse:
            res.reverse()

        # remove duplicates and print
        for item in OrderedDict.fromkeys(res):
            print(item)

    elif args.warn_empty:
        print("NO MATCH - use a different filter")


if __name__ == "__main__":
    parser = ArgumentParser(description="Extracts tokens from plaintext.")

    parser.add_argument(
        "--name", action="store_true", help="prefix filter name in the output"
    )

    parser.add_argument(
        "-w", "--words", action="store_true", help='extract "word" tokens'
    )

    parser.add_argument("-l", "--lines", action="store_true", help="extract lines")

    parser.add_argument(
        "--all",
        action="store_true",
        help="extract using all filters defined in extrakto.conf",
    )

    parser.add_argument(
        "-a", "--add", action="append", default=[], help="add custom filter"
    )

    parser.add_argument("-p", "--paths", action="store_true", help="short for -a=path")

    parser.add_argument("-u", "--urls", action="store_true", help="short for -a=url")

    parser.add_argument(
        "--alt",
        action="store_true",
        help="return alternate variants for each match (e.g. https://example.com and example.com)",
    )

    parser.add_argument("-r", "--reverse", action="store_true", help="reverse output")

    parser.add_argument(
        "-m", "--min-length", default=5, help="minimum token length", type=int
    )

    parser.add_argument(
        "--warn-empty", action="store_true", help="warn if result is empty"
    )

    main(parser)