Source code for tabledataextractor.table.footnotes

# -*- coding: utf-8 -*-
"""
Footnote handling.

.. codeauthor:: Juraj Mavračić <[email protected]>

"""

import logging
import numpy as np
import re
from .parse import CellParser

log = logging.getLogger(__name__)


class Footnote:
    """
    Defines a footnote found in the provided table.
    Contains elements of a footnote.
    Will construct the footnote and find all associated elements.

    On construction the footnote text is resolved (either from the ``text``
    argument or from a cell to the right of the prefix cell), and every cell
    above the footnote row that references the prefix is located. The
    references are expanded with the footnote text in
    :attr:`pre_cleaned_table`, which is a private copy; the table's own
    ``pre_cleaned_table`` is not modified by this class.

    :param table: table to work on
    :type table: ~tabledataextractor.table.table.Table
    :param prefix: Prefix that has been identified as footnote prefix
    :type prefix: str
    :param prefix_cell: Index of the cell containing the associated prefix
    :type prefix_cell: (int, int)
    :param text: Optional. Text associated with the found footnote prefix.
        If empty or ``None``, the text is looked up in the cells to the right
        of the prefix cell.
    :type text: str
    """

    def __init__(self, table, prefix, prefix_cell, text):
        self._table = table

        #: Footnote-internal copy of the pre-cleaned table, in which the
        #: footnote references get replaced by the footnote text.
        self.pre_cleaned_table = np.copy(self._table.pre_cleaned_table)

        #: Prefix string, e.g., `"a)"`.
        self.prefix = prefix

        #: Cell index of the prefix, e.g., `(7,0)`.
        self.prefix_cell = prefix_cell

        # The attributes below depend on each other and must be resolved in
        # this order: text cell -> text -> reference cells -> references.

        #: Cell of the footnote text, e.g., `(7,1)`.
        #: Equals the prefix cell if the text was found together with the prefix.
        self.text_cell = self.prefix_cell if text else self._find_text_cell()

        #: Footnote text, e.g., `"This is the text of a footnote"`.
        self.text = text if text else self._find_text()

        #: Cell indexes of the cells containing the footnote references within the table.
        self.reference_cells = self._find_reference_cells()

        #: Cell content of the cells containing the footnote references within the table.
        self.references = self._find_references()

    def _find_text_cell(self):
        """
        Finds the cell index containing the text associated with the prefix.

        Scans the prefix row from the column right of the prefix cell to the
        end of the row and returns the first cell that is not marked as empty.

        :return: ``(row, column)`` of the text cell, or ``None`` if every cell
            to the right of the prefix is empty (or there are no such cells)
        """
        row = self.prefix_cell[0]
        column_count = np.shape(self.pre_cleaned_table)[1]
        for column_index in range(self.prefix_cell[1] + 1, column_count):
            if not self._table.pre_cleaned_table_empty[row, column_index]:
                return row, column_index
        # Only give up once the whole remainder of the row has been checked.
        return None

    def _find_text(self):
        """
        Finds the text associated with the prefix, only one cell can contain the text.

        :return: content of :attr:`text_cell` as a string, or ``""`` if no text cell was found
        """
        if self.text_cell is None:
            return ""
        return str(self.pre_cleaned_table[self.text_cell])

    def _expand_spaced_references(self):
        """
        Handles references of the form `(anything)+space+prefix`, optionally followed by `space+(anything)`.

        Every matching cell above the footnote row is rewritten in the
        footnote-internal table as `leading text + footnote text + trailing text`.
        The prefix is inserted into the pattern unescaped, so this must only be
        used for prefixes without regex metacharacters (digits or a single letter).

        :return: [(int,int)] indices of the rewritten cells
        """
        cells = []
        parser = CellParser(r'(^.+\s)(' + self.prefix + r')(\s.+)?$')
        for fn_ref in parser.parse(self.pre_cleaned_table[0:self.prefix_cell[0]], method='match'):
            cell, groups = fn_ref[:2], fn_ref[2]
            cells.append(cell)
            expanded = groups[0]
            expanded += self.text if self.text is not None else ""
            # The trailing group is optional and is None when nothing follows the prefix.
            if groups[2] is not None:
                expanded += groups[2]
            self.pre_cleaned_table[cell] = expanded
        return cells

    def _find_reference_cells(self):
        """
        Searches the entire table above each footnote for the previously detected footnote prefix.
        Updates the footnote-internal version of the `pre-cleaned table`, by replacing the footnote prefix
        in the reference cell with the footnote text.

        Rules for matching:

            1. if prefix is `number` (one or two digits):
                a) matches if `(anything)+space+prefix`
            2. if prefix is a single letter:
                a) matches if `(anything)+space+prefix` OR
                b) matches if `prefix` is alone in the cell
            3. else:
                a) matches if found anywhere in any cell

        :return: [(int,int)]

        """
        # indices of the references
        fn_refs = []

        # Case 1a If prefix is number, general
        if re.fullmatch(pattern=r'[\d]{1,2}', string=self.prefix):
            log.debug("Footnote prefix %s is number", self.prefix)
            fn_refs.extend(self._expand_spaced_references())

        # Case 2a If prefix is a-z:
        elif re.fullmatch(pattern=r'[a-zA-Z]', string=self.prefix):
            log.debug("Footnote prefix %s is letter", self.prefix)
            fn_refs.extend(self._expand_spaced_references())

            # Case 2b If prefix is a-z and alone in the cell
            fn_ref_parser_2b = CellParser('^(' + self.prefix + ')$')
            for fn_ref in fn_ref_parser_2b.parse(self.pre_cleaned_table[0:self.prefix_cell[0]], method='match'):
                log.debug("Footnote prefix %s is letter and is alone in cell.", self.prefix)
                fn_refs.append(fn_ref[:2])
                # The whole cell is the reference, so it is replaced by the footnote text.
                self.pre_cleaned_table[fn_ref[:2]] = self.text if self.text is not None else ""

        # Case 3, everything else
        else:
            # Arbitrary prefixes (e.g. `*` or `a)`) may contain regex metacharacters.
            fn_ref_parser = CellParser('(' + re.escape(self.prefix) + ')')
            repl = " " + self.text + " " if self.text is not None else " "
            for fn_ref in fn_ref_parser.replace(self.pre_cleaned_table[0:self.prefix_cell[0]],
                                                repl=repl,
                                                method='search'):
                fn_refs.append(fn_ref[:2])
                # The third element is stored as the new cell content.
                self.pre_cleaned_table[fn_ref[:2]] = fn_ref[2]

        return fn_refs

    def _find_references(self):
        """
        Collects the raw references, no cleanup.

        The content is read from the table's own pre-cleaned table, not from
        the footnote-internal copy, so the prefixes are still present.
        """
        return [self._table.pre_cleaned_table[cell] for cell in self.reference_cells]

    def __str__(self):
        return "Prefix: {:4}   Text: {:60}   Ref. Cells: {}   " \
               "References: {}".format("'"+str(self.prefix)+"'",
                                       "'"+str(self.text)+"'",
                                       str(self.reference_cells),
                                       str(self.references))


def find_footnotes(table_object):
    """
    Finds footnotes and yields a :class:`~tabledataextractor.table.footnotes.Footnote` object for each,
    with all the appropriate properties.

    A cell is treated as a footnote cell if it starts with a footnote prefix::

        FNprefix  = one of: * # . o †, a digit, or a lowercase letter a-z;
                    possibly followed by "." or ")"

    The prefix must not be directly followed by a digit. It may be followed by
    the footnote text (word characters, square brackets, whitespace and colons,
    with an optional trailing period) within the same cell.

    Only matches in rows with an index greater than ``table_object._cc4[0]`` are
    yielded, i.e. the search is performed only below the data region.

    :param table_object: Input Table object
    :type table_object: ~tabledataextractor.table.table.Table
    """
    # Group 1 captures the prefix, group 2 the (possibly empty) text in the same cell.
    footnote_pattern = r'^([*#\.o†\da-z][\.\)]?)(?!\d)\s?(([\w\[\]\s\:]+)?\.?)\s?$'
    parser = CellParser(footnote_pattern)

    for candidate in parser.parse(table_object.pre_cleaned_table):
        # Skip every match that is not below the row given by `_cc4`.
        if not candidate[0] > table_object._cc4[0]:
            continue
        location = (candidate[0], candidate[1])
        groups = candidate[2]
        yield Footnote(table_object, prefix=groups[0], prefix_cell=location, text=groups[1])