#!/usr/bin/env python
r'''
NAME
   pdf-extract-tables file.pdf > out.txt

SYNOPSIS
   pdf-extract-tables file.pdf > out.txt       default output format:

Options:
   -t     cells tagged by column number (_5: value\n)
          col sep: "\n"; white values cells removed
          row sep: emptyline ("\n\n")
   -tn    col sep: emptyline (\n\n);
          row sep: double emptyline (\n\n\n);
   -tn1   No tags (no "_5:", just the value)
   -q     quiet: cell
   -s     skip tables that appear reversed strings

Description
   Gets
'''

from jjcli import *
import pdfplumber

# The module docstring above doubles as the usage text handed to jjcli.
cl = clfilter(opt="1sqtnp:", doc=__doc__)     ## option values in cl.opt dictionary

# FIXME page ranges....
# NOTE: "-p" is accepted and parsed here, but `page` is not consulted anywhere
# below -- every page of every input file is processed.
page = int(cl.opt.get("-p", 7))

# Settings passed verbatim to pdfplumber's Page.extract_tables(); an empty
# dict means "use pdfplumber defaults".  Uncomment entries to tune detection.
table_settings = {
    # "vertical_strategy": "lines",
    # "horizontal_strategy": "text",
    # "snap_y_tolerance": 5,
    # "intersection_x_tolerance": 15,
}


def guess_rev(tab):
    """Look for evidence that a table's text was extracted reversed.

    The cells are stripped and flattened into one string, then two sets of
    regex hits are counted:

    * ``notok`` -- shapes typical of reversed text: punctuation right after a
      space and before a lowercase letter, a lowercase letter followed by an
      uppercase one at a word end, a word ending in one of ``cdfpqt``;
    * ``ok`` -- the mirror-image shapes, typical of text in reading order.

    The table is considered reversed when ``notok > ok + 3``.

    Args:
        tab: table as a list of rows, each row a list of cells (``str`` or
            ``None``).

    Returns:
        * ``tab`` itself, untouched, when the table does not look reversed;
        * ``[]`` when it looks reversed and ``-s`` (skip) was given;
        * a new table with every cell stripped and character-reversed when it
          looks reversed and ``-s`` was not given.
    """
    # Normalise: None/empty cells become "", everything else is stripped.
    newtab = [[col.strip() if col else "" for col in row] for row in tab]
    ex = " ".join(" ".join(row) for row in newtab)

    notok = len(findall(r' [,.!?;:][a-z]|[a-z][A-Z] |[a-z][cdfpqt] ', ex))
    ok = len(findall(r'[a-z][,.!?;:] | [A-Z][a-z]| [cdfpqt][a-z]', ex))
    looks_reversed = notok > ok + 3

    if not looks_reversed:
        return tab
    if "-s" in cl.opt:
        return []
    # The original second branch repeated the `"-s" in cl.opt` test and so
    # could never run; un-reversing is what happens when -s is NOT given.
    return [[col[::-1].strip() for col in row] for row in newtab]


def row2txt(row):
    """Render one table row as text according to the -t / -n / -1 options.

    Without ``-t`` the cells are joined with ``" | "``; ``None`` cells become
    empty strings and embedded newlines become ``•``.

    With ``-t`` each non-empty, non-blank cell is emitted on its own line
    (preceded by ``"\\n"``, or ``"\\n\\n"`` with ``-n``), prefixed by its
    1-based column tag ``_N: `` unless ``-1`` is given; embedded newlines
    become ``• ``.

    Args:
        row: list of cells (``str`` or ``None``).

    Returns:
        The formatted row as a single string (possibly empty in ``-t`` mode).
    """
    if "-t" not in cl.opt:
        return " | ".join(
            sub(r'\n', r'•', "" if x is None else str(x)) for x in row
        )

    with_tags = "-1" not in cl.opt
    sep = "\n\n" if "-n" in cl.opt else "\n"
    parts = []
    for i, x in enumerate(row, start=1):
        if not x:
            continue
        v = sub(r'\n', r'• ', str(x))
        if v.isspace():
            continue
        tag = f"_{i}: " if with_tags else ""
        parts.append(f"{sep}{tag}{v}")
    return "".join(parts)


for filename in cl.args:
    # Context manager so the underlying file handle is released even if
    # table extraction raises part-way through a document.
    with pdfplumber.open(filename) as pdf:
        print(len(pdf.pages))
        for i, p0 in enumerate(pdf.pages, start=1):
            # p0 = pdf.pages[page]           # go to the required page
            tables = p0.extract_tables(table_settings)
            for j, table in enumerate(tables, start=1):
                print(f"\n#------------------------------pg-{i}, table-{j}-")
                tab = guess_rev(table)
                for row in tab:
                    v1 = row2txt(row)
                    if "-t" in cl.opt or "-q" in cl.opt:
                        # Drop "-•" (hyphen at an original line break), then
                        # collapse remaining line-break markers and spaces.
                        v1 = sub(r'-• *', '', v1)
                        v1 = sub(r'[• ]+', ' ', v1)
                        # -t takes precedence over -q, as in the if/elif order.
                        print(v1 if "-t" in cl.opt else f"\n{v1}")
                    else:
                        print(f"____\n{v1}")

# Debugging recipe kept from the original file (was an inert string literal):
#
#   # tables = p0.debug_tablefinder()  # list of tables which pdfplumber identifies
#   # req_table = tables.tables[0]     # Suppose you want to use ith table
#   cells = req_table.cells            # gives list of all cells in that table
#   print(cells)
#   for i, cell in enumerate(cells):   ## [i:j]:  # iterating through the required cells
#       print(i, p0.crop(cell).extract_words())   # extract the words