#!/usr/bin/python3
"""bs-grep -- BeautifulSoup grep

Usage:
   bs-grep regexp file.html-or-URL
      prints the matching text nodes and their numbered tag paths

Options:
   -a 5         get all elems of path numbered 5 (as text)
   -a h5 / j5   the same, as html / json
   -m 5         show at most 5 occurrences of the pattern (default: 20)
   -a 2.5       print find_all(tag(2.5), attrs=attrs(2.5))
   -d 2.5       delete(find_all(tag(2.5), attrs=attrs(2.5)))
   -h           show the menu of the last saved paths

---- todo
   --dc         extract Dublin Core metadata
"""

import json
import re

import jjcli
from bs4 import BeautifulSoup as bs


def txt_compact(t):
    """Strip the text and squeeze the whitespace around newlines."""
    return "\n".join(re.split(r'\s*\n\s*', t.strip()))


def path(a):
    """Return the path of text node `a` as a list of (tag, attrs) pairs,
    from the outermost ancestor (below html/body) down to the text itself."""
    x = a
    pa = [("txt", str(a))]
    while x := x.find_parent():
        if x.name == "html":
            break
        if x.name == "body":
            continue
        pa.append((x.name, x.attrs))
    return list(reversed(pa))


def showpath(pa, prefix=""):
    print("===")
    for i, (nam, att) in enumerate(pa):
        print(f"{prefix}{i}\t{nam}\t{att}")


def showpalist(palist, a="", prefix=""):
    if a:
        print(a)
    for j, pa in enumerate(palist):
        print("===")
        for i, (tag, att) in enumerate(pa):
            print(f"{prefix}{j+1}.{i}\t{tag}\t{att}")


def savepath(a):
    """Save the list of paths as (lightly pretty-printed) JSON in ./__bs-grep."""
    with open("__bs-grep", "w", encoding="utf8") as f:
        s = json.dumps(a, ensure_ascii=False)
        s = re.sub(r', \[', r',\n[', s)
        s = re.sub(r'\[\[', r'\n[[', s)
        print(s, file=f)


def loadpath():
    """Load the paths saved by the last run, prepending a default path."""
    with open("__bs-grep", "r", encoding="utf8") as f:
        palist = json.load(f)
    palist.insert(0, [("article", {}), ("meta", {})])
    return palist


def opth():
    palist = loadpath()
    showpalist(palist)
    exit(0)


def arg_extract(palist, cl):
    """Build the extraction query (fmt, a, del) from the options and palist."""
    query = {"fmt": "txt", "a": {}, "del": [], "b": None}
    if opta := cl.opt.get("-a"):
        if mat := re.match(r'([htj]?)(\d+)(?:\.(\d+))?', opta):
            get = int(mat[3]) if mat[3] else 1
            ind = int(mat[2]) - 1
            pa = palist[ind] if len(palist) > ind else []
            tag, attr = pa[get] if get < len(pa) else ("article", {})
            if not mat[1] or mat[1] == "t":
                query["fmt"] = "txt"
            elif mat[1] == "h":
                query["fmt"] = "html"
            elif mat[1] == "j":
                query["fmt"] = "json"
            query["a"] = {"tag": tag, "attr": attr}
        else:
            jjcli.die(f"Invalid '-a' value: {opta}")
    if optd := cl.opt.get("-d"):
        if mat := re.match(r'(h?)(\d+)(?:\.(\d+))?', optd):
            get = int(mat[3]) if mat[3] else 1
            ind = int(mat[2]) - 1
            pa = palist[ind] if len(palist) > ind else []
            if get >= len(pa):
                jjcli.die(f"Invalid '-d' [{ind+1}]")
            tag, attr = pa[get]
            query["del"] = [{"tag": tag, "attr": attr}]   ## FIXME: multiple -d
        else:
            jjcli.die(f"Invalid '-d' value: {optd}")
    return query


def extract(dt, query):
    """Delete the elements selected by -d, then print the ones selected by -a."""
    for ele in query.get("del", []):
        for d in dt.find_all(ele.get("tag", True), attrs=ele.get("attr", {})):
            d.decompose()
    elems = dt.find_all(query["a"].get("tag", True), attrs=query["a"].get("attr", {}))
    if query["fmt"] == "txt":
        for elem in elems:
            print(f"===================================\n{txt_compact(elem.text)}")
    elif query["fmt"] == "html":
        for elem in elems:
            print(f"===================================\n{elem}")
    elif query["fmt"] == "json":
        print(json.dumps([txt_compact(elem.text) for elem in elems],
                         ensure_ascii=False, indent=3))
    else:
        jjcli.die("FIXME: unknown format")


def main():
    cl = jjcli.clfilter("r:om:a:hb:d:", doc=__doc__)
    m = int(cl.opt.get("-m", 20))   ## max matches to show
    patt = None
    if "-h" in cl.opt:
        opth()
    if "-a" in cl.opt:
        palist = loadpath()
        query = arg_extract(palist, cl)
    elif len(cl.args) >= 1:
        patt = cl.args.pop(0)
        re_patt = re.compile(patt, flags=re.I)
    for html in cl.text():
        dt = bs(html, 'lxml')
        if patt:                     ## search mode: show and save the paths
            allpath = []
            alist = dt.find_all(string=re_patt, limit=m)
            for i, a in enumerate(alist):
                pa = path(a)
                allpath.append(pa)
                showpath(pa, f"{i+1}.")
            savepath(allpath)
        elif "-a" in cl.opt:         ## extraction mode: reuse a saved path
            extract(dt, query)


main()

# url = "https://www.atlasdasaude.pt/doencasAaZ"
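
# ------------------------------------------------------------------
# Typical two-step workflow, a sketch; the file name and pattern are
# hypothetical:
#
#   bs-grep diabetes page.html        # 1) search: print each match with its
#                                     #    numbered tag path and save the
#                                     #    paths to ./__bs-grep
#   bs-grep -h                        #    review the saved paths
#   bs-grep -a 2.5 page.html          # 2) extract: find_all() every element
#                                     #    matching (tag, attrs) of path 2,
#                                     #    elem 5, printed as text
#   bs-grep -a h2.5 -d 2.3 page.html  #    the same as html, after first
#                                     #    decomposing the elements selected
#                                     #    by path 2, elem 3
#
# The __bs-grep file is plain JSON: a list of paths, each path a list of
# [tag, attrs] pairs ending in a ["txt", "<matched text>"] leaf.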