# Source code for tabledataextractor.table.footnotes
# -*- coding: utf-8 -*-
"""
Footnote handling.
.. codeauthor:: Juraj Mavračić <[email protected]>
"""
import logging
import numpy as np
import re
from .parse import CellParser
log = logging.getLogger(__name__)
class Footnote:
    """
    Defines a footnote found in the provided table.

    Contains elements of a footnote.
    Will construct the footnote and find all associated elements.

    :param table: table to work on
    :type table: ~tabledataextractor.table.table.Table
    :param prefix: Prefix that has been identified as footnote prefix
    :type prefix: str
    :param prefix_cell: Index of the cell containing the associated prefix
    :type prefix_cell: (int, int)
    :param text: Optional. Text associated with the found footnote prefix.
        If falsy, the text is looked up in the first non-empty cell to the right of the prefix cell.
    :type text: str
    """

    def __init__(self, table, prefix, prefix_cell, text):
        self._table = table
        # Footnote-internal working copy; reference cells are rewritten in this copy only.
        self.pre_cleaned_table = np.copy(self._table.pre_cleaned_table)
        #: Prefix string, e.g., `"a)"`.
        self.prefix = prefix
        #: Cell index of the prefix, e.g., `(7,0)`.
        self.prefix_cell = prefix_cell
        #: Cell of the footnote text, e.g., `(7,1)`. Same as the prefix cell if `text` was given.
        self.text_cell = self.prefix_cell if text else self._find_text_cell()
        #: Footnote text, e.g., `"This is the text of a footnote"`.
        self.text = text if text else self._find_text()
        #: Cell indexes of the cells containing the footnote references within the table.
        self.reference_cells = self._find_reference_cells()
        #: Cell content of the cells containing the footnote references within the table.
        self.references = self._find_references()

    def _find_text_cell(self):
        """
        Finds the cell index containing the text associated with the prefix.

        Scans the prefix row to the right of the prefix cell and returns the index of the
        first cell that is not flagged as empty in the table's `pre_cleaned_table_empty`.

        :return: (int, int) or None if every cell to the right is empty
        """
        row_index = self.prefix_cell[0]
        for column_index in range(self.prefix_cell[1] + 1, np.shape(self.pre_cleaned_table)[1]):
            if not self._table.pre_cleaned_table_empty[row_index, column_index]:
                return row_index, column_index
        return None

    def _find_text(self):
        """
        Finds the text associated with the prefix, only one cell can contain the text.

        :return: content of the text cell as `str`, or `""` if no text cell was found
        """
        if self.text_cell is None:
            return ""
        return str(self.pre_cleaned_table[self.text_cell])

    def _substitute_suffixed_references(self):
        """
        Handles the rule `(anything)+space+prefix`, optionally followed by whitespace and more text.

        Used for number prefixes and single-letter prefixes (Cases 1a and 2a). Every matching cell
        above the footnote row is rewritten in the footnote-internal table so that the prefix
        is replaced by the footnote text; the text before and after the prefix is kept.

        :return: [(int,int)] indices of the rewritten cells
        """
        # The prefix is a plain digit or letter here, so it is safe to embed without escaping.
        parser = CellParser(r'(^.+\s)(' + self.prefix + r')(\s.+)?$')
        cells = []
        for fn_ref in parser.parse(self.pre_cleaned_table[0:self.prefix_cell[0]], method='match'):
            cells.append(fn_ref[:2])
            # fn_ref[2] holds the matched groups: leading text, prefix, optional trailing text
            stripped_text = fn_ref[2][0]
            stripped_text += self.text if self.text is not None else ""
            if fn_ref[2][2] is not None:
                stripped_text += fn_ref[2][2]
            self.pre_cleaned_table[fn_ref[:2]] = stripped_text
        return cells

    def _find_reference_cells(self):
        """
        Searches the entire table above each footnote for the previously detected footnote prefix.
        Updates the footnote-internal version of the `pre-cleaned table`, by cutting out the footnote prefix
        out of the reference cell and putting the footnote text in its place.

        Rules for matching:

            1. if prefix is `number` (one or two digits):
                a) matches if `(anything)+space+prefix`, optionally followed by `space+(anything)`
            2. if prefix is a single letter `a-z` or `A-Z`:
                a) matches if `(anything)+space+prefix`, optionally followed by `space+(anything)` OR
                b) matches if `prefix` is alone in the cell
            3. else:
                a) matches if found anywhere in any cell

        :return: [(int,int)]
        """
        # indices of the references
        fn_refs = []

        # Case 1a If prefix is number, general
        if re.fullmatch(pattern=r'[\d]{1,2}', string=self.prefix):
            log.debug("Footnote prefix {} is number".format(self.prefix))
            fn_refs.extend(self._substitute_suffixed_references())

        # Case 2a If prefix is a-z:
        elif re.fullmatch(pattern='[a-zA-Z]', string=self.prefix):
            log.debug("Footnote prefix {} is letter".format(self.prefix))
            fn_refs.extend(self._substitute_suffixed_references())

            # Case 2b If prefix is a-z and alone in the cell
            fn_ref_parser_2b = CellParser('^(' + self.prefix + ')$')
            for fn_ref in fn_ref_parser_2b.parse(self.pre_cleaned_table[0:self.prefix_cell[0]], method='match'):
                log.debug("Footnote prefix {} is letter and is alone in cell.".format(self.prefix))
                fn_refs.append(fn_ref[:2])
                stripped_text = self.text if self.text is not None else ""
                self.pre_cleaned_table[fn_ref[:2]] = stripped_text

        # Case 3, everything else
        else:
            # The prefix may contain regex metacharacters (e.g. `*`, `.`, `)`), hence the escaping.
            fn_ref_parser = CellParser('(' + re.escape(self.prefix) + ')')
            repl = " "+self.text+" " if self.text is not None else " "
            for fn_ref in fn_ref_parser.replace(self.pre_cleaned_table[0:self.prefix_cell[0]],
                                                repl=repl,
                                                method='search'):
                fn_refs.append(fn_ref[:2])
                # here fn_ref[2] is used directly as the new cell content
                stripped_text = fn_ref[2]
                self.pre_cleaned_table[fn_ref[:2]] = stripped_text

        return fn_refs

    def _find_references(self):
        """
        Collects the raw references, no cleanup.

        The content is read from the table's own `pre_cleaned_table`, not from the
        footnote-internal copy that has been rewritten.

        :return: list of cell contents, one per entry in `reference_cells`
        """
        return [self._table.pre_cleaned_table[cell] for cell in self.reference_cells]

    def __str__(self):
        return "Prefix: {:4} Text: {:60} Ref. Cells: {} " \
               "References: {}".format("'"+str(self.prefix)+"'",
                                       "'"+str(self.text)+"'",
                                       str(self.reference_cells),
                                       str(self.references))
def find_footnotes(table_object):
    """
    Finds footnotes and yields a :class:`~tabledataextractor.table.footnotes.Footnote` object
    with all the appropriate properties for each of them.

    A footnote cell is defined with::

        FNprefix = one of: * # . o †, a digit, or a lowercase letter a-z;
                   possibly followed by "." or ")", and not directly followed by a digit
        FNtext   = optional text in the same cell, after the prefix

    A search is performed only below the data region: a matching cell is accepted only if its
    row index is greater than ``table_object._cc4[0]``.

    :param table_object: Input Table object
    :type table_object: ~tabledataextractor.table.table.Table
    :return: generator of :class:`~tabledataextractor.table.footnotes.Footnote`
    """
    #: finds a footnote cell that possibly contains some text as well
    fn_parser = CellParser(r'^([*#\.o†\da-z][\.\)]?)(?!\d)\s?(([\w\[\]\s\:]+)?\.?)\s?$')
    for fn in fn_parser.parse(table_object.pre_cleaned_table):
        # fn[0], fn[1] are used as the row/column index of the match, fn[2] as its regex groups
        if fn[0] > table_object._cc4[0]:
            # first group is the prefix, second group the (possibly empty) text in the same cell
            footnote = Footnote(table_object, prefix=fn[2][0], prefix_cell=(fn[0], fn[1]), text=fn[2][1])
            yield footnote