# -*- coding: utf-8 -*-
"""
Reads an `html` formatted table.
"""
import numpy as np
from bs4 import BeautifulSoup
import requests
from selenium import webdriver
from selenium.webdriver.firefox.options import Options as FirefoxOptions
from selenium.webdriver.chrome.options import Options as ChromeOptions
from selenium.webdriver.edge.options import Options as EdgeOptions
from selenium.webdriver.ie.options import Options as IeOptions
import copy
import logging
from tabledataextractor.exceptions import InputError
log = logging.getLogger(__name__)


def makearray(html_table):
"""
Creates a numpy array from an `.html` file, taking `rowspan` and `colspan` into account.
Modified from:
John Ricco, https://johnricco.github.io/2017/04/04/python-html/, *Using Python to scrape HTML tables with merged cells*
Added functionality for duplicating cell content for cells with `rowspan`/`colspan`.
The table has to be :math:`n*m`, rectangular, with the same number of columns in every row.
"""
    n_cols = 0
    n_rows = 0
    for row in html_table.find_all("tr"):
        col_tags = row.find_all(["td", "th"])
        if len(col_tags) > 0:
            n_rows += 1
            if len(col_tags) > n_cols:
                n_cols = len(col_tags)

    # according to the numpy documentation fill_value should be of type Union[int, float, complex],
    # however, 'str' works just fine
    array = np.full((n_rows, n_cols), fill_value="", dtype='<U60')

    # list to store rowspan values
    skip_index = [0 for i in range(0, n_cols)]

    # iterating over each row in the table
    row_counter = 0
    for row in html_table.find_all("tr"):

        # skip row if it's empty
        if len(row.find_all(["td", "th"])) == 0:
            continue

        else:
            # get all the cells containing data in this row
            columns = row.find_all(["td", "th"])
            col_dim = []
            row_dim = []
            col_dim_counter = -1
            row_dim_counter = -1
            col_counter = -1
            this_skip_index = copy.deepcopy(skip_index)

            for col in columns:

                # determine all cell dimensions
                colspan = col.get("colspan")
                if not colspan:
                    col_dim.append(1)
                else:
                    col_dim.append(int(colspan))
                col_dim_counter += 1

                rowspan = col.get("rowspan")
                if not rowspan:
                    row_dim.append(1)
                else:
                    row_dim.append(int(rowspan))
                row_dim_counter += 1

                # adjust column counter
                if col_counter == -1:
                    col_counter = 0
                else:
                    col_counter = col_counter + col_dim[col_dim_counter - 1]
                while skip_index[col_counter] > 0:
                    col_counter += 1

                # get cell contents
                cell_data = col.get_text()

                # insert data into cell
                array[row_counter, col_counter] = cell_data

                # insert data into neighbouring rowspan/colspan cells
                if colspan:
                    for spanned_col in range(col_counter + 1, col_counter + int(colspan)):
                        array[row_counter, spanned_col] = cell_data
                if rowspan:
                    for spanned_row in range(row_counter + 1, row_counter + int(rowspan)):
                        array[spanned_row, col_counter] = cell_data

                # record column skipping index
                if row_dim[row_dim_counter] > 1:
                    this_skip_index[col_counter] = row_dim[row_dim_counter]

            # adjust row counter
            row_counter += 1

            # adjust column skipping index
            skip_index = [i - 1 if i > 0 else i for i in this_skip_index]

    return array
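
# Illustrative sketch of the rowspan/colspan duplication (the helper below is
# hypothetical and not part of the library API): a 2x2 table whose first cell
# spans both rows gets that cell's text duplicated down its column.
def _makearray_example():
    soup = BeautifulSoup(
        "<table>"
        "<tr><td rowspan='2'>a</td><td>b</td></tr>"
        "<tr><td>c</td></tr>"
        "</table>", features='lxml')
    # returns array([['a', 'b'], ['a', 'c']], dtype='<U60')
    return makearray(soup.find("table"))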


def read_file(file_path, table_number=1):
"""Reads an .html file and returns a numpy array."""
file = open(file_path, encoding='UTF-8')
html_soup = BeautifulSoup(file, features='lxml')
file.close()
html_table = html_soup.find_all("table")[table_number-1]
array = makearray(html_table)
return array
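
# Usage sketch for read_file (the file name and helper are hypothetical):
# the default table_number=1 returns the first <table> in the file as a
# rectangular numpy array of strings.
def _read_file_example():
    table = read_file('table_example.html', table_number=1)
    # `table` is an n*m numpy array (dtype '<U60'), merged cells duplicated
    return table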


def read_url(url, table_number=1):
"""
Reads in a table from an URL and returns a numpy array. Will try `Requests <http://docs.python-requests.org/en/master/>`_ first. If it doesn't succeed, `Selenium <https://selenium-python.readthedocs.io/>`_ will be used.
:param url: Url of the page where the table is located
:type url: str
:param table_number: Number of Table on the web page.
:type table_number: int
"""
    if not isinstance(table_number, int):
        msg = 'Table number is not valid.'
        log.critical(msg)
        raise TypeError(msg)

    # first try the requests package; if it fails, fall back to selenium, which is much slower
    try:
        html_file = requests.get(url)
        html_soup = BeautifulSoup(html_file.text, features='lxml')
        html_table = html_soup.find_all("table")[table_number - 1]
        array = makearray(html_table)
        log.info("Package 'requests' was used.")
        return array
    except Exception:
        driver = configure_selenium()
        driver.get(url)
        html_file = driver.page_source
        html_soup = BeautifulSoup(html_file, features='lxml')
        try:
            html_table = html_soup.find_all("table")[table_number - 1]
        except IndexError:
            raise InputError("table_number={} is out of range".format(table_number))
        else:
            array = makearray(html_table)
            log.info("Package 'selenium' was used.")
            return array
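

# Usage sketch for read_url (the URL below is hypothetical): static pages are
# fetched with requests; JavaScript-rendered pages fall back to selenium, which
# requires a matching webdriver on the system. Tables are counted from 1, so
# table_number=2 selects the second <table> on the page.
if __name__ == '__main__':
    table = read_url('https://example.com/page-with-tables', table_number=2)
    print(table.shape)  # (n_rows, n_cols) of the rectangular table
    print(table)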