# Licensed under a 3-clause BSD style license - see LICENSE.rst """ This module tests some of the methods related to the ``HTML`` reader/writer and aims to document its functionality. Requires `BeautifulSoup `_ to be installed. """ import os from io import StringIO from pathlib import Path import numpy as np import pytest from astropy.io import ascii from astropy.io.ascii import core, html from astropy.table import Table from astropy.utils.compat.optional_deps import HAS_BLEACH, HAS_BS4 from .common import setup_function, teardown_function # noqa: F401 if HAS_BS4: from bs4 import BeautifulSoup, FeatureNotFound @pytest.mark.skipif(not HAS_BS4, reason="requires BeautifulSoup4") def test_soupstring(): """ Test to make sure the class SoupString behaves properly. """ soup = BeautifulSoup( "

foo

", "html.parser" ) soup_str = html.SoupString(soup) assert isinstance(soup_str, str) assert isinstance(soup_str, html.SoupString) assert soup_str == "

foo

" assert soup_str.soup is soup def test_listwriter(): """ Test to make sure the class ListWriter behaves properly. """ lst = [] writer = html.ListWriter(lst) for i in range(5): writer.write(i) for ch in "abcde": writer.write(ch) assert lst == [0, 1, 2, 3, 4, "a", "b", "c", "d", "e"] @pytest.mark.skipif(not HAS_BS4, reason="requires BeautifulSoup4") def test_identify_table(): """ Test to make sure that identify_table() returns whether the given BeautifulSoup tag is the correct table to process. """ # Should return False on non- tags and None soup = BeautifulSoup("", "html.parser") assert html.identify_table(soup, {}, 0) is False assert html.identify_table(None, {}, 0) is False soup = BeautifulSoup( '

A
B

', "html.parser", ).table assert html.identify_table(soup, {}, 2) is False assert html.identify_table(soup, {}, 1) is True # Default index of 1 # Same tests, but with explicit parameter assert html.identify_table(soup, {"table_id": 2}, 1) is False assert html.identify_table(soup, {"table_id": 1}, 1) is True # Test identification by string ID assert html.identify_table(soup, {"table_id": "bar"}, 1) is False assert html.identify_table(soup, {"table_id": "foo"}, 1) is True @pytest.mark.skipif(not HAS_BS4, reason="requires BeautifulSoup4") def test_missing_data(): """ Test reading a table with missing data """ # First with default where blank => '0' table_in = [ "", "", "", "", "

A

1

", ] dat = Table.read(table_in, format="ascii.html") assert dat.masked is False assert np.all(dat["A"].mask == [True, False]) assert dat["A"].dtype.kind == "i" # Now with a specific value '...' => missing table_in = [ "", "", "", "", "

A
...
1

", ] dat = Table.read(table_in, format="ascii.html", fill_values=[("...", "0")]) assert dat.masked is False assert np.all(dat["A"].mask == [True, False]) assert dat["A"].dtype.kind == "i" @pytest.mark.skipif(not HAS_BS4, reason="requires BeautifulSoup4") def test_rename_cols(): """ Test reading a table and renaming cols """ table_in = [ "", "", "", "

A	B
1	2

", ] # Swap column names dat = Table.read(table_in, format="ascii.html", names=["B", "A"]) assert dat.colnames == ["B", "A"] assert len(dat) == 1 # Swap column names and only include A (the renamed version) dat = Table.read( table_in, format="ascii.html", names=["B", "A"], include_names=["A"] ) assert dat.colnames == ["A"] assert len(dat) == 1 assert np.all(dat["A"] == 2) @pytest.mark.skipif(not HAS_BS4, reason="requires BeautifulSoup4") def test_no_names(): """ Test reading a table with no column header """ table_in = ["", "", "", "

"] dat = Table.read(table_in, format="ascii.html") assert dat.colnames == ["col1"] assert len(dat) == 2 dat = Table.read(table_in, format="ascii.html", names=["a"]) assert dat.colnames == ["a"] assert len(dat) == 2 @pytest.mark.skipif(not HAS_BS4, reason="requires BeautifulSoup4") def test_identify_table_fail(): """ Raise an exception with an informative error message if table_id is not found. """ table_in = ['', "

A
B

"] with pytest.raises(core.InconsistentTableError) as err: Table.read( table_in, format="ascii.html", htmldict={"table_id": "bad_id"}, guess=False ) assert err.match("ERROR: HTML table id 'bad_id' not found$") with pytest.raises(core.InconsistentTableError) as err: Table.read(table_in, format="ascii.html", htmldict={"table_id": 3}, guess=False) assert err.match("ERROR: HTML table number 3 not found$") @pytest.mark.skipif(not HAS_BS4, reason="requires BeautifulSoup4") def test_backend_parsers(): """ Make sure the user can specify which back-end parser to use and that an error is raised if the parser is invalid. """ for parser in ("lxml", "xml", "html.parser", "html5lib"): try: Table.read( "data/html2.html", format="ascii.html", htmldict={"parser": parser}, guess=False, ) except FeatureNotFound: if parser == "html.parser": raise # otherwise ignore if the dependency isn't present # reading should fail if the parser is invalid with pytest.raises(FeatureNotFound): Table.read( "data/html2.html", format="ascii.html", htmldict={"parser": "foo"}, guess=False, ) @pytest.mark.skipif(HAS_BS4, reason="requires no BeautifulSoup4") def test_htmlinputter_no_bs4(): """ This should return an OptionalTableImportError if BeautifulSoup is not installed. """ inputter = html.HTMLInputter() with pytest.raises(core.OptionalTableImportError): inputter.process_lines([]) @pytest.mark.skipif(not HAS_BS4, reason="requires BeautifulSoup4") def test_htmlinputter(): """ Test to ensure that HTMLInputter correctly converts input into a list of SoupStrings representing table elements. """ f = "data/html.html" with open(f) as fd: table = fd.read() inputter = html.HTMLInputter() inputter.html = {} # In absence of table_id, defaults to the first table expected = [ "Column 1Column 2Column 3", "1a1.05", "2b2.75", "3c-1.25", ] assert [str(x) for x in inputter.get_lines(table)] == expected # Should raise an InconsistentTableError if the table is not found inputter.html = {"table_id": 4} with pytest.raises(core.InconsistentTableError): inputter.get_lines(table) # Identification by string ID inputter.html["table_id"] = "second" expected = [ "Column AColumn BColumn C", "4d10.5", "5e27.5", "6f-12.5", ] assert [str(x) for x in inputter.get_lines(table)] == expected # Identification by integer index inputter.html["table_id"] = 3 expected = [ "C1C2C3", "7g105.0", "8h275.0", "9i-125.0", ] assert [str(x) for x in inputter.get_lines(table)] == expected @pytest.mark.skipif(not HAS_BS4, reason="requires BeautifulSoup4") def test_htmlsplitter(): """ Test to make sure that HTMLSplitter correctly inputs lines of type SoupString to return a generator that gives all header and data elements. """ splitter = html.HTMLSplitter() lines = [ html.SoupString( BeautifulSoup( "

Col 1	Col 2

", "html.parser" ).tr ), html.SoupString( BeautifulSoup( "

Data 1

Data 2

", "html.parser" ).tr ), ] expected_data = [["Col 1", "Col 2"], ["Data 1", "Data 2"]] assert list(splitter(lines)) == expected_data # Make sure the presence of a non-SoupString triggers a TypeError lines.append("Data 3Data 4") with pytest.raises(TypeError): list(splitter(lines)) # Make sure that passing an empty list triggers an error with pytest.raises(core.InconsistentTableError): list(splitter([])) @pytest.mark.parametrize( "get_table", [ lambda path: os.fspath(path), lambda path: Path(path), lambda path: Path(path).read_text(), ], ) @pytest.mark.skipif(not HAS_BS4, reason="requires BeautifulSoup4") def test_htmlheader_start(get_table): """ Test to ensure that the start_line method of HTMLHeader returns the first line of header data. Uses t/html.html for sample input. """ table_file = "data/html.html" table = get_table(table_file) inputter = html.HTMLInputter() inputter.html = {} header = html.HTMLHeader() lines = inputter.get_lines(table) assert ( str(lines[header.start_line(lines)]) == "Column 1Column 2Column 3" ) inputter.html["table_id"] = "second" lines = inputter.get_lines(table) assert ( str(lines[header.start_line(lines)]) == "Column AColumn BColumn C" ) inputter.html["table_id"] = 3 lines = inputter.get_lines(table) assert ( str(lines[header.start_line(lines)]) == "C1C2C3" ) # start_line should return None if no valid header is found lines = [ html.SoupString( BeautifulSoup("

Data

", "html.parser").tr ), html.SoupString(BeautifulSoup("

Text

", "html.parser").p), ] assert header.start_line(lines) is None # Should raise an error if a non-SoupString is present lines.append("Header") with pytest.raises(TypeError): header.start_line(lines) @pytest.mark.skipif(not HAS_BS4, reason="requires BeautifulSoup4") def test_htmldata(): """ Test to ensure that the start_line and end_lines methods of HTMLData returns the first line of table data. Uses t/html.html for sample input. """ f = "data/html.html" with open(f) as fd: table = fd.read() inputter = html.HTMLInputter() inputter.html = {} data = html.HTMLData() lines = inputter.get_lines(table) assert ( str(lines[data.start_line(lines)]) == "1a1.05" ) # end_line returns the index of the last data element + 1 assert ( str(lines[data.end_line(lines) - 1]) == "3c-1.25" ) inputter.html["table_id"] = "second" lines = inputter.get_lines(table) assert ( str(lines[data.start_line(lines)]) == "4d10.5" ) assert ( str(lines[data.end_line(lines) - 1]) == "6f-12.5" ) inputter.html["table_id"] = 3 lines = inputter.get_lines(table) assert ( str(lines[data.start_line(lines)]) == "7g105.0" ) assert ( str(lines[data.end_line(lines) - 1]) == "9i-125.0" ) # start_line should raise an error if no table data exists lines = [ html.SoupString(BeautifulSoup("

", "html.parser").div), html.SoupString(BeautifulSoup("

Text

", "html.parser").p), ] with pytest.raises(core.InconsistentTableError): data.start_line(lines) # end_line should return None if no table data exists assert data.end_line(lines) is None # Should raise an error if a non-SoupString is present lines.append("Data") with pytest.raises(TypeError): data.start_line(lines) with pytest.raises(TypeError): data.end_line(lines) def test_multicolumn_write(): """ Test to make sure that the HTML writer writes multidimensional columns (those with iterable elements) using the colspan attribute of . """ col1 = [1, 2, 3] col2 = [(1.0, 1.0), (2.0, 2.0), (3.0, 3.0)] col3 = [("a", "a", "a"), ("b", "b", "b"), ("c", "c", "c")] table = Table([col1, col2, col3], names=("C1", "C2", "C3")) expected = """\

C1	C2		C3
1	1.0	1.0	a	a	a
2	2.0	2.0	b	b	b
3	3.0	3.0	c	c	c

""" out = html.HTML().write(table)[0].strip() assert out == expected.strip() @pytest.mark.skipif(not HAS_BLEACH, reason="requires bleach") def test_multicolumn_write_escape(): """ Test to make sure that the HTML writer writes multidimensional columns (those with iterable elements) using the colspan attribute of . """ col1 = [1, 2, 3] col2 = [(1.0, 1.0), (2.0, 2.0), (3.0, 3.0)] col3 = [("", "", "a"), ("", "b", "b"), ("c", "c", "c")] table = Table([col1, col2, col3], names=("C1", "C2", "C3")) expected = """\

C1	C2		C3
1	1.0	1.0			a
2	2.0	2.0		b	b
3	3.0	3.0	c	c	c

""" out = html.HTML(htmldict={"raw_html_cols": "C3"}).write(table)[0].strip() assert out == expected.strip() def test_write_no_multicols(): """ Test to make sure that the HTML writer will not use multi-dimensional columns if the multicol parameter is False. """ col1 = [1, 2, 3] col2 = [(1.0, 1.0), (2.0, 2.0), (3.0, 3.0)] col3 = [("a", "a", "a"), ("b", "b", "b"), ("c", "c", "c")] table = Table([col1, col2, col3], names=("C1", "C2", "C3")) expected = """\

C1	C2	C3
1	1.0 .. 1.0	a .. a
2	2.0 .. 2.0	b .. b
3	3.0 .. 3.0	c .. c

""" assert html.HTML({"multicol": False}).write(table)[0].strip() == expected.strip() @pytest.mark.skipif(not HAS_BS4, reason="requires BeautifulSoup4") def test_multicolumn_read(): """ Test to make sure that the HTML reader inputs multidimensional columns (those with iterable elements) using the colspan attribute of . Ensure that any string element within a multidimensional column casts all elements to string prior to type conversion operations. """ table = Table.read("data/html2.html", format="ascii.html") str_type = np.dtype((str, 21)) expected = Table( np.array( [(["1", "2.5000000000000000001"], 3), (["1a", "1"], 3.5)], dtype=[("A", str_type, (2,)), ("B", "x"], ["y"]], names=["a", "b"]) # One column contains raw HTML (string input) out = StringIO() t.write(out, format="ascii.html", htmldict={"raw_html_cols": "a"}) expected = """\ x <em>y</em> """ assert expected in out.getvalue() # One column contains raw HTML (list input) out = StringIO() t.write(out, format="ascii.html", htmldict={"raw_html_cols": ["a"]}) assert expected in out.getvalue() # Two columns contains raw HTML (list input) out = StringIO() t.write(out, format="ascii.html", htmldict={"raw_html_cols": ["a", "b"]}) expected = """\ x y """ assert expected in out.getvalue() @pytest.mark.skipif(not HAS_BLEACH, reason="requires bleach") def test_raw_html_write_clean(): """ Test that columns can contain raw HTML which is not escaped. """ import bleach t = Table( [[""], ["

"], ["y"]], names=["a", "b", "c"] ) # Confirm that