Source code for tabledataextractor.table.table

# -*- coding: utf-8 -*-
"""
Represents a table in a highly standardized format.

.. codeauthor:: Juraj Mavračić <[email protected]>

"""

import logging
import numpy as np

from tabledataextractor.input import from_any
from tabledataextractor.output.print import as_string, print_table, list_as_PrettyTable
from tabledataextractor.output.to_csv import write_to_csv
from tabledataextractor.output.to_pandas import to_pandas, build_category_table
from tabledataextractor.table.parse import StringParser
from tabledataextractor.exceptions import InputError, MIPSError, TDEError
from tabledataextractor.table.history import History
from tabledataextractor.table.algorithms import find_cc1_cc2, find_cc3, find_cc4, prefix_duplicate_labels, \
    duplicate_spanning_cells, header_extension_up, find_title_row, find_note_cells, empty_cells, \
    pre_clean, split_table, standardize_empty, header_extension_down, find_row_header_table, clean_row_header
from tabledataextractor.table.footnotes import find_footnotes

log = logging.getLogger(__name__)


class Table:
    """
    Main `TableDataExtractor` object that includes the raw (input), cleaned (processed) and labelled tables.
    Represents the table input (.csv, .html, python list, url) in a highly standardized `category table` format,
    using the MIPS (*Minimum Indexing Point Search*) algorithm.

    Optional configuration keywords (defaults):

        * ``use_title_row = True``
            A title row will be assumed if possible.
        * ``use_prefixing = True``
            Will perform the prefixing steps if row or column index cells are not unique.
        * ``use_spanning_cells = True``
            Will duplicate spanning cells in the row and column header regions if needed.
        * ``use_header_extension = True``
            Will extend the row and column header beyond the MIPS-defined headers, if needed.
        * ``use_footnotes = True``
            Will copy the footnote text into the appropriate cells of the table and remove the footnote prefix.
        * ``use_max_data_area = False``
            If `True` the max data area will be used to determine the cell `CC2` in the main MIPS algorithm.
            It is probably never necessary to set this to True.
        * ``standardize_empty_data = True``
            Will standardize empty cells in the `data` region to 'NoValue'
        * ``row_header = None``
            If an integer is given, it indicates the index of `row_header` columns.
            This overwrites the MIPS algorithm.
            For example, ``row_header = 0`` will make only the first column a row header.
        * ``col_header = None``
            If an integer is given, it indicates the index of `col_header` rows.
            This overwrites the MIPS algorithm.
            For example, ``col_header = 0`` will make only the first row a column header.

    :param file_path: Path to .html or .csv file, URL or list object that is used as input
    :type file_path: str | list
    :param table_number: Number of table to read, if there are several at the given url, or in the html file
    :type table_number: int
    """

    def __init__(self, file_path, table_number=1, **kwargs):
        """
        Runs required `TableDataExtractor` algorithms automatically upon initialization.

        :param file_path: Input handed to ``from_any.create_table`` whenever the raw table is read
        :param table_number: Number of the table to read from the input
        :param kwargs: Configuration keywords; every key must exist in ``_default_configs``
        :raises InputError: If an unknown configuration keyword is given, or the input table is empty
            or one-dimensional
        :raises MIPSError: If the main MIPS algorithm fails
        """
        log.info('Initialization of table: "{}"'.format(file_path))
        self._file_path = file_path
        self._table_number = table_number
        self._configs = self._set_configs(**kwargs)
        self._history = History()
        self._analyze_table()

    @property
    def _default_configs(self):
        """Default configuration keywords. A fresh dictionary is built on every access."""
        return {'use_title_row': True,
                'use_prefixing': True,
                'use_footnotes': True,
                'use_spanning_cells': True,
                'use_header_extension': True,
                'use_max_data_area': False,
                'standardize_empty_data': True,
                'row_header': None,
                'col_header': None}

    def _analyze_table(self):
        """
        Performs the analysis of the input table and is run automatically on initialization of the table object.

        :raises InputError: If all cells of the input table are empty
        :raises MIPSError: If the critical cells cannot be determined
        """

        # check if input array is empty
        if empty_cells(self.raw_table).all():
            msg = 'Input table is empty.'
            log.critical(msg)
            raise InputError(msg)

        # clean-up the input array
        self._pre_cleaned_table = pre_clean(self.raw_table)
        log.debug("Table shape changed from {} to {}.".format(np.shape(self.raw_table),
                                                              np.shape(self.pre_cleaned_table)))

        if self.configs['use_spanning_cells']:
            self._pre_cleaned_table = duplicate_spanning_cells(self, self._pre_cleaned_table)

        if self.configs['use_prefixing']:
            self._pre_cleaned_table = prefix_duplicate_labels(self, self._pre_cleaned_table)

        # footnotes handling
        self._footnotes = []
        for footnote in find_footnotes(self):
            self._footnotes.append(footnote)
            if self.configs['use_footnotes']:
                self._copy_footnotes(footnote)

        # Main MIPS algorithm, finding the data and header regions
        try:
            #: Critical cells `CC1` and `CC2`
            self._cc1, self._cc2 = find_cc1_cc2(self, self._cc4, self._pre_cleaned_table)
        except (MIPSError, TypeError) as e:
            msg = "ERROR: Main MIPS Algorithm failed. Maybe the input table is bad!"
            log.critical(msg)
            # chain explicitly, so that the original failure stays visible as the cause
            raise MIPSError(msg) from e
        else:
            log.debug("Table Cell CC1 = {}; Table Cell CC2 = {}".format(self._cc1, self._cc2))

        if self.configs['use_header_extension']:
            self._cc1 = header_extension_up(self, self._cc1)
            self._cc2 = header_extension_down(self, self._cc1, self._cc2, self._cc4)
            log.debug("Header extension, new cc1 = {}, new cc2 = {}".format(self._cc1, self._cc2))

        # check if critical cell `CC3` can be found; any exception raised by the lookup simply propagates
        _ = self._cc3

    @property
    def footnotes(self):
        """
        List of footnotes in the table. Each footnote is an instance of
        :class:`~tabledataextractor.table.footnotes.Footnote`.

        :type: list[~tabledataextractor.table.footnotes.Footnote]
        """
        return self._footnotes

    @property
    def title_row(self):
        """
        Title row of the table. `None` if ``use_title_row`` is disabled.

        :type: list
        """
        if self._configs['use_title_row']:
            return find_title_row(self)

    @property
    def history(self):
        """
        Indicates which algorithms have been applied to the table by TableDataExtractor.

        :type: ~tabledataextractor.table.history.History
        """
        return self._history

    @property
    def labels(self):
        """
        Cell labels. An array of the same shape as the ``pre_cleaned_table``, holding the name of the region
        each cell belongs to.

        :type: numpy.array
        """
        temp = np.empty_like(self._pre_cleaned_table, dtype="<U60")
        temp[:, :] = '/'

        if self.configs['use_title_row']:
            temp[self.title_row, :] = 'TableTitle'

        temp[self._cc1[0]:self._cc2[0] + 1, self._cc1[1]:self._cc2[1] + 1] = 'StubHeader'
        temp[self._cc3[0]:self._cc4[0] + 1, self._cc1[1]:self._cc2[1] + 1] = 'RowHeader'
        temp[self._cc1[0]:self._cc2[0] + 1, self._cc3[1]:self._cc4[1] + 1] = 'ColHeader'
        temp[self._cc3[0]:self._cc4[0] + 1, self._cc3[1]:self._cc4[1] + 1] = 'Data'

        for footnote in self.footnotes:
            temp[footnote.prefix_cell[0], footnote.prefix_cell[1]] = 'FNprefix'
            if footnote.text_cell is not None:
                # a text cell that is already labelled is the prefix cell itself
                temp[footnote.text_cell[0], footnote.text_cell[1]] = 'FNtext' if \
                    temp[footnote.text_cell[0], footnote.text_cell[1]] == '/' else 'FNprefix & FNtext'
            for ref_cell in footnote.reference_cells:
                # keep an existing region label and append the footnote reference marker to it
                temp[ref_cell[0], ref_cell[1]] = 'FNref' if temp[ref_cell[0], ref_cell[1]] == '/' else \
                    temp[ref_cell[0], ref_cell[1]] + ' & FNref'

        # all non-empty unlabelled cells at this point are labelled 'Note'
        for note_cell in find_note_cells(self, temp):
            temp[note_cell] = 'Note'

        return temp

    @property
    def configs(self):
        """
        Configuration keywords set at the creation of the :class:`~tabledataextractor.table.table.Table` instance.

        :type: dict
        """
        return self._configs

    @property
    def raw_table(self):
        """
        Input table, as provided to `TableDataExtractor`. The input is read again on every access and is
        returned transposed if the history marks the table as transposed.

        :type: numpy.array
        :raises InputError: If the input table has only one row or column
        """
        temp = from_any.create_table(self._file_path, self._table_number)
        # internal invariant of the input converters, not a validation of user input
        assert isinstance(temp, np.ndarray) and temp.dtype == '<U60'

        if temp.ndim == 1:
            msg = 'Input table has only one row or column.'
            log.critical(msg)
            raise InputError(msg)

        if not self.history.table_transposed:
            return temp
        else:
            return temp.T

    @property
    def pre_cleaned_table(self):
        """
        Cleaned-up table. This table is used for labelling the table regions, finding data-cells and building the
        category table.

        :type: numpy.array
        """
        return self._pre_cleaned_table

    @property
    def pre_cleaned_table_empty(self):
        """
        Mask array with `True` for all empty cells of the ``pre_cleaned_table``.

        :type: numpy.array
        """
        return empty_cells(self._pre_cleaned_table)

    @property
    def category_table(self):
        """
        Standardized table, where each row corresponds to a single data point of the original table.
        The columns are the row and column categories where the data point belongs to.

        :type: list
        :raises MIPSError: If the critical cells have not been found
        """
        if self._cc1 and self._cc2 and self._cc3 and self._cc4:
            return build_category_table(to_pandas(self))
        else:
            msg = "Category table not built. Critical cells have not been found."
            raise MIPSError(msg)

    @property
    def col_header(self):
        """
        Column header of the table.

        :type: numpy.ndarray
        :raises MIPSError: If the critical cells have not been found
        """
        if self._cc1 and self._cc2 and self._cc3 and self._cc4:
            return self._pre_cleaned_table[self._cc1[0]:self._cc2[0] + 1, self._cc3[1]:self._cc4[1] + 1]
        else:
            msg = "No column header. Critical cells have not been found."
            raise MIPSError(msg)

    @property
    def row_header(self):
        """
        Row header of the table.

        :type: numpy.ndarray
        :raises MIPSError: If the critical cells have not been found
        """
        if self._cc1 and self._cc2 and self._cc3 and self._cc4:
            return self._pre_cleaned_table[self._cc3[0]:self._cc4[0] + 1, self._cc1[1]:self._cc2[1] + 1]
        else:
            msg = "No row header. Critical cells have not been found."
            raise MIPSError(msg)

    @property
    def stub_header(self):
        """
        Stub header of the table.

        :type: numpy.ndarray
        :raises MIPSError: If the critical cells have not been found
        """
        if self._cc1 and self._cc2 and self._cc3 and self._cc4:
            return self._pre_cleaned_table[self._cc1[0]:self._cc2[0] + 1, self._cc1[1]:self._cc2[1] + 1]
        else:
            msg = "No stub header. Critical cells have not been found."
            raise MIPSError(msg)

    @property
    def data(self):
        """
        Data region of the table.

        :type: numpy.ndarray
        :raises MIPSError: If the critical cells have not been found
        """
        if self._cc1 and self._cc2 and self._cc3 and self._cc4:
            data_region = self._pre_cleaned_table[self._cc3[0]:self._cc4[0] + 1, self._cc3[1]:self._cc4[1] + 1]
            if self.configs['standardize_empty_data']:
                data_region = standardize_empty(data_region)
            return data_region
        else:
            msg = "No data region. Critical cells have not been found."
            raise MIPSError(msg)

    @property
    def subtables(self):
        """
        List of all subtables. Each subtable is an instance of :class:`~tabledataextractor.table.table.Table`.
        Collection stops at the first subtable for which the MIPS algorithm fails.

        :type: list[~tabledataextractor.table.table.Table]
        """
        tables = []
        for subtable in split_table(self):
            # a `None` item ends the collection, exactly like an exhausted generator
            if subtable is None:
                break
            try:
                tables.append(Table(subtable))
            except MIPSError as e:
                log.exception("Subtable MIPS failure {}".format(e.args))
                break
        return tables

    @property
    def row_categories(self):
        """
        Table where the original stub header is the first row(s) and all subsequent rows are the row categories
        of the original table. The assumption is made that the stub header labels row categories
        (that is, cells below the stub header). The `row_categories` table can be used if the row categories want
        to be analyzed as `data` themselves, which can occur if the header regions of the original table
        intentionally have duplicate elements.

        `None` is returned if such a table cannot be built, or if its data region contains empty cells.

        :type: ~tabledataextractor.table.table.TrivialTable
        """
        # this outer try statement is necessary to catch some weird errors with empty category tables
        try:
            # both properties are recomputed on every access, so they are fetched only once here
            stub_header = self.stub_header
            stub_width = len(stub_header.T)
            if stub_width == 0:
                return None

            category_table = self.category_table
            if stub_width != len(category_table[0][1]):
                return None

            raw_table = find_row_header_table(category_table, stub_header)
            try:
                table = TrivialTable(raw_table, clean_row_header=True, row_header=0,
                                     col_header=len(stub_header) - 1)
            except TDEError:
                return None

            if not empty_cells(table.data).any():
                return table
            else:
                return None
        except IndexError:
            return None

    def contains(self, pattern):
        """
        Returns true if table contains a particular string.

        :param pattern: Regular expression for input
        :return: True/False
        """
        parser = StringParser(pattern)
        for row in self.category_table:
            # data value, row categories and column categories of one data point, joined by single spaces
            string = row[0] + ' '
            string += ' '.join(row[1]) + ' '
            string += ' '.join(row[2])
            if parser.parse(string, method='search'):
                return True
        return False

    def transpose(self):
        """
        Transposes the `Table` and performs the analysis again.
        In this way, if working interactively from a `Jupyter` notebook, it is possible to input a table and then
        transpose it to see how it looks like and if the results of the standardization are different.
        """
        # NOTE(review): the flag is always set to True on a fresh history, so a second call presumably
        # leaves the table transposed instead of restoring the original orientation -- confirm intent
        self._history = History()
        self.history._table_transposed = True
        self._analyze_table()

    @property
    def _cc4(self):
        """Critical cell `CC4`."""
        return find_cc4(self)

    @property
    def _cc3(self):
        """Critical cell `CC3`."""
        return find_cc3(self, self._cc2)

    def _set_configs(self, **kwargs):
        """
        Sets the configuration parameters based on the user input.

        :param kwargs: Configuration keywords that overwrite the defaults
        :return: Dictionary of all configuration keywords
        :raises InputError: If a keyword is not one of the default configuration keywords
        """
        # `_default_configs` builds a fresh dictionary, which holds exactly the set of valid keywords
        configs = self._default_configs
        for key, value in kwargs.items():
            if key in configs:
                configs[key] = value
            else:
                msg = 'Keyword "{}" does not exist.'.format(key)
                log.critical(msg)
                raise InputError(msg)
        log.info('Configuration parameters are: {}'.format(configs))
        return configs

    def _copy_footnotes(self, footnote):
        """
        Updates the pre-cleaned table with updated reference cells for a given footnote.

        :param footnote: Footnote object whose ``pre_cleaned_table`` replaces the current one, if different
        """
        if not np.array_equal(self._pre_cleaned_table, footnote.pre_cleaned_table):
            self._pre_cleaned_table = np.copy(footnote.pre_cleaned_table)
            self.history._footnotes_copied = True
            log.debug("METHOD. Footnotes copied into cells.")

    def print(self):
        """
        Prints the `raw table` (input), `cleaned table` (processed by `TableDataExtractor`) and
        `labels` (regions of the table) nicely.
        """
        log.debug("Printing table: {}".format(self._file_path))
        print_table(self.raw_table)
        print_table(self._pre_cleaned_table)
        print_table(self.labels)

    def print_raw_table(self):
        """Prints raw input table nicely."""
        print_table(self.raw_table)

    def to_csv(self, file_path):
        """
        Saves the `raw_table` to a `.csv` file.

        :param file_path: Path of the output file
        """
        # log the destination that is written to, not the path the table was read from
        log.info("Saving raw table to .csv to file: {}".format(file_path))
        write_to_csv(self.raw_table, file_path=file_path)

    def to_pandas(self):
        """
        Converts the `Table` into a `Pandas DataFrame`, taking the complex MultiIndex structure of the table
        into account.

        :return: pandas.DataFrame
        """
        log.info("Converting table to Pandas DataFrame: {}".format(self._file_path))
        return to_pandas(self)

    def __str__(self):
        """As the user wants to see it"""
        log.debug("Printing table: {}".format(self._file_path))
        t = list_as_PrettyTable(self.category_table)
        return str(t)

    def __repr__(self):
        """As the developer wants to see it"""
        intro = "Table({}, table_number={}, transposed={})".format(self._file_path, self._table_number,
                                                                   self.history.table_transposed)
        log.debug("Repr. table: {}".format(self._file_path))
        array_width = np.shape(self._pre_cleaned_table)[1]
        input_string = as_string(self.raw_table)
        # cleaned table and labels are stacked into one array, separated by a single empty row
        results_string = as_string(
            np.concatenate((self._pre_cleaned_table, np.full((1, array_width), "", dtype='<U60'), self.labels)))
        t = list_as_PrettyTable(self.category_table)
        return intro + "\n\n" + input_string + results_string + str(t)
class TrivialTable(Table):
    """
    Trivial Table object. No high level analysis will be performed. MIPS algorithm is never run.
    This table doesn't have footnotes, a title row or subtables.

    Optional configuration keywords (defaults):

        * ``standardize_empty_data = False``
            Will standardize empty cells in the `data` region to 'NoValue'.
        * ``clean_row_header = False``
            Removes duplicate rows that span the whole table (all columns).
        * ``row_header = 0``
            The column up to which the row header is defined.
        * ``col_header = 0``
            The row up to which the column header is defined.
    """

    def __init__(self, file_path, table_number=1, **kwargs):
        """Hands all arguments on to the :class:`~tabledataextractor.table.table.Table` initializer."""
        super().__init__(file_path=file_path, table_number=table_number, **kwargs)

    @property
    def _default_configs(self):
        """Default configuration keywords of the trivial table."""
        return dict(standardize_empty_data=False,
                    clean_row_header=False,
                    row_header=0,
                    col_header=0)

    def _analyze_table(self):
        """
        Defines the header regions directly from the configuration keywords.
        Is run automatically on initialization of the table object.
        """
        # an input without any content is rejected
        if empty_cells(self.raw_table).all():
            msg = 'Input table is empty.'
            log.critical(msg)
            raise InputError(msg)

        # critical cells cc1 and cc2 are taken from the configuration (no MIPS algorithm is used in TrivialTable)
        self._cc1 = (0, 0)
        self._cc2 = (self.configs['col_header'], self.configs['row_header'])
        log.debug("Table Cell CC1 = {}; Table Cell CC2 = {}".format(self._cc1, self._cc2))

        self._pre_cleaned_table = self.raw_table
        if self.configs['clean_row_header']:
            self._pre_cleaned_table = clean_row_header(self.pre_cleaned_table, self._cc2)

    @property
    def _cc4(self):
        """Critical cell `CC4`, the last cell of the table."""
        last_row = len(self.pre_cleaned_table) - 1
        last_col = len(self.pre_cleaned_table.T) - 1
        return last_row, last_col

    @property
    def _cc3(self):
        """Critical cell `CC3`, the first cell of the data region."""
        # single-column table: the data starts below `CC2`, in the same column
        if len(self.pre_cleaned_table.T) == 1:
            return self._cc2[0] + 1, self._cc2[1]
        # single-row table: the data starts right of `CC2`, in the same row
        if len(self.pre_cleaned_table) == 1:
            return self._cc2[0], self._cc2[1] + 1
        return self._cc2[0] + 1, self._cc2[1] + 1

    @property
    def labels(self):
        """
        Cell labels. Only the stub header, row header, column header and data regions are labelled.

        :type: numpy.array
        """
        header_rows = slice(self._cc1[0], self._cc2[0] + 1)
        header_cols = slice(self._cc1[1], self._cc2[1] + 1)
        data_rows = slice(self._cc3[0], self._cc4[0] + 1)
        data_cols = slice(self._cc3[1], self._cc4[1] + 1)

        cell_labels = np.empty_like(self._pre_cleaned_table, dtype="<U60")
        cell_labels[:, :] = '/'
        cell_labels[header_rows, header_cols] = 'StubHeader'
        cell_labels[data_rows, header_cols] = 'RowHeader'
        cell_labels[header_rows, data_cols] = 'ColHeader'
        cell_labels[data_rows, data_cols] = 'Data'
        return cell_labels

    @property
    def col_header(self):
        """
        Column header of the table. `None` if the critical cells are not all available.

        :type: numpy.ndarray
        """
        if not self._critical_cells:
            return None

        region = self.pre_cleaned_table[self._cc1[0]:self._cc2[0] + 1, self._cc3[1]:self._cc4[1] + 1]
        if self._cc3[0] > self._cc2[0]:
            return region
        # the data region starts in the header row, an array of empty strings is returned instead
        return np.full_like(region, fill_value='', dtype='<U60')

    @property
    def row_header(self):
        """
        Row header of the table. Enables a one-column table.
        `None` if the critical cells are not all available.

        :type: numpy.ndarray
        """
        if not self._critical_cells:
            return None

        region = self.pre_cleaned_table[self._cc3[0]:self._cc4[0] + 1, self._cc1[1]:self._cc2[1] + 1]
        if self._cc3[1] > self._cc2[1]:
            return region
        # the data region starts in the header column, an array of empty strings is returned instead
        return np.full_like(region, fill_value='', dtype='<U60')

    @property
    def _critical_cells(self):
        """Indicates if all the critical cells have been found."""
        return bool(self._cc1 and self._cc2 and self._cc3 and self._cc4)

    @property
    def footnotes(self):
        """None"""
        return None

    @property
    def title_row(self):
        """None"""
        return None

    @property
    def subtables(self):
        """None"""
        return None