# Licensed under a 3-clause BSD style license - see LICENSE.rst """An extensible HTML table reader and writer. html.py: Classes to read and write HTML tables `BeautifulSoup `_ must be installed to read HTML tables. """ import warnings from copy import deepcopy from astropy.table import Column from astropy.utils.compat.optional_deps import HAS_BS4 from astropy.utils.xml import writer from . import core class SoupString(str): """ Allows for strings to hold BeautifulSoup data. """ def __new__(cls, *args, **kwargs): return str.__new__(cls, *args, **kwargs) def __init__(self, val): self.soup = val class ListWriter: """ Allows for XMLWriter to write to a list instead of a file. """ def __init__(self, out): self.out = out def write(self, data): self.out.append(data) def identify_table(soup, htmldict, numtable): """ Checks whether the given BeautifulSoup tag is the table the user intends to process. """ if soup is None or soup.name != "table": return False # Tag is not a elif "table_id" not in htmldict: return numtable == 1 table_id = htmldict["table_id"] if isinstance(table_id, str): return "id" in soup.attrs and soup["id"] == table_id elif isinstance(table_id, int): return table_id == numtable # Return False if an invalid parameter is given return False class HTMLInputter(core.BaseInputter): """ Input lines of HTML in a valid form. This requires `BeautifulSoup `_ to be installed. """ def process_lines(self, lines): """ Convert the given input into a list of SoupString rows for further processing. """ if not HAS_BS4: raise core.OptionalTableImportError( "BeautifulSoup must be installed to read HTML tables" ) from bs4 import BeautifulSoup if "parser" not in self.html: with warnings.catch_warnings(): # Ignore bs4 parser warning #4550. warnings.filterwarnings( "ignore", ".*no parser was explicitly specified.*" ) soup = BeautifulSoup("\n".join(lines)) else: # use a custom backend parser soup = BeautifulSoup("\n".join(lines), self.html["parser"]) tables = soup.find_all("table") for i, possible_table in enumerate(tables): if identify_table(possible_table, self.html, i + 1): table = possible_table # Find the correct table break else: if isinstance(self.html["table_id"], int): err_descr = f"number {self.html['table_id']}" else: err_descr = f"id '{self.html['table_id']}'" raise core.InconsistentTableError( f"ERROR: HTML table {err_descr} not found" ) # Get all table rows soup_list = [SoupString(x) for x in table.find_all("tr")] return soup_list class HTMLSplitter(core.BaseSplitter): """ Split HTML table data. """ def __call__(self, lines): """ Return HTML data from lines as a generator. """ for line in lines: if not isinstance(line, SoupString): raise TypeError("HTML lines should be of type SoupString") soup = line.soup header_elements = soup.find_all("th") if header_elements: # Return multicolumns as tuples for HTMLHeader handling yield [ (el.text.strip(), el["colspan"]) if el.has_attr("colspan") else el.text.strip() for el in header_elements ] data_elements = soup.find_all("td") if data_elements: yield [el.text.strip() for el in data_elements] if len(lines) == 0: raise core.InconsistentTableError( "HTML tables must contain data in a
tag" ) class HTMLOutputter(core.TableOutputter): """ Output the HTML data as an ``astropy.table.Table`` object. This subclass allows for the final table to contain multidimensional columns (defined using the colspan attribute of
). """ default_converters = [ core.convert_numpy(int), core.convert_numpy(float), core.convert_numpy(str), ] def __call__(self, cols, meta): """ Process the data in multidimensional columns. """ new_cols = [] col_num = 0 while col_num < len(cols): col = cols[col_num] if hasattr(col, "colspan"): # Join elements of spanned columns together into list of tuples span_cols = cols[col_num : col_num + col.colspan] new_col = core.Column(col.name) new_col.str_vals = list(zip(*[x.str_vals for x in span_cols])) new_cols.append(new_col) col_num += col.colspan else: new_cols.append(col) col_num += 1 return super().__call__(new_cols, meta) class HTMLHeader(core.BaseHeader): splitter_class = HTMLSplitter def start_line(self, lines): """ Return the line number at which header data begins. """ for i, line in enumerate(lines): if not isinstance(line, SoupString): raise TypeError("HTML lines should be of type SoupString") soup = line.soup if soup.th is not None: return i return None def _set_cols_from_names(self): """ Set columns from header names, handling multicolumns appropriately. """ self.cols = [] new_names = [] for name in self.names: if isinstance(name, tuple): col = core.Column(name=name[0]) col.colspan = int(name[1]) self.cols.append(col) new_names.append(name[0]) for i in range(1, int(name[1])): # Add dummy columns self.cols.append(core.Column("")) new_names.append("") else: self.cols.append(core.Column(name=name)) new_names.append(name) self.names = new_names class HTMLData(core.BaseData): splitter_class = HTMLSplitter def start_line(self, lines): """ Return the line number at which table data begins. """ for i, line in enumerate(lines): if not isinstance(line, SoupString): raise TypeError("HTML lines should be of type SoupString") soup = line.soup if soup.td is not None: if soup.th is not None: raise core.InconsistentTableError( "HTML tables cannot have headings and data in the same row" ) return i raise core.InconsistentTableError("No start line found for HTML data") def end_line(self, lines): """ Return the line number at which table data ends. """ last_index = -1 for i, line in enumerate(lines): if not isinstance(line, SoupString): raise TypeError("HTML lines should be of type SoupString") soup = line.soup if soup.td is not None: last_index = i if last_index == -1: return None return last_index + 1 class HTML(core.BaseReader): """HTML format table. In order to customize input and output, a dict of parameters may be passed to this class holding specific customizations. **htmldict** : Dictionary of parameters for HTML input/output. * css : Customized styling If present, this parameter will be included in a