# Licensed under a 3-clause BSD style license - see LICENSE.rst """ This module tests some of the methods related to the ``HTML`` reader/writer and aims to document its functionality. Requires `BeautifulSoup `_ to be installed. """ from io import StringIO import numpy as np import pytest from astropy.io import ascii from astropy.io.ascii import core, html from astropy.table import Table from astropy.utils.compat.optional_deps import HAS_BLEACH, HAS_BS4 from .common import setup_function, teardown_function # noqa: F401 if HAS_BS4: from bs4 import BeautifulSoup, FeatureNotFound @pytest.mark.skipif(not HAS_BS4, reason="requires BeautifulSoup4") def test_soupstring(): """ Test to make sure the class SoupString behaves properly. """ soup = BeautifulSoup( "

foo

", "html.parser" ) soup_str = html.SoupString(soup) assert isinstance(soup_str, str) assert isinstance(soup_str, html.SoupString) assert soup_str == "

foo

" assert soup_str.soup is soup def test_listwriter(): """ Test to make sure the class ListWriter behaves properly. """ lst = [] writer = html.ListWriter(lst) for i in range(5): writer.write(i) for ch in "abcde": writer.write(ch) assert lst == [0, 1, 2, 3, 4, "a", "b", "c", "d", "e"] @pytest.mark.skipif(not HAS_BS4, reason="requires BeautifulSoup4") def test_identify_table(): """ Test to make sure that identify_table() returns whether the given BeautifulSoup tag is the correct table to process. """ # Should return False on non- tags and None soup = BeautifulSoup("", "html.parser") assert html.identify_table(soup, {}, 0) is False assert html.identify_table(None, {}, 0) is False soup = BeautifulSoup( '

A
B

', "html.parser", ).table assert html.identify_table(soup, {}, 2) is False assert html.identify_table(soup, {}, 1) is True # Default index of 1 # Same tests, but with explicit parameter assert html.identify_table(soup, {"table_id": 2}, 1) is False assert html.identify_table(soup, {"table_id": 1}, 1) is True # Test identification by string ID assert html.identify_table(soup, {"table_id": "bar"}, 1) is False assert html.identify_table(soup, {"table_id": "foo"}, 1) is True @pytest.mark.skipif(not HAS_BS4, reason="requires BeautifulSoup4") def test_missing_data(): """ Test reading a table with missing data """ # First with default where blank => '0' table_in = [ "", "", "", "", "

A

1

", ] dat = Table.read(table_in, format="ascii.html") assert dat.masked is False assert np.all(dat["A"].mask == [True, False]) assert dat["A"].dtype.kind == "i" # Now with a specific value '...' => missing table_in = [ "", "", "", "", "

A
...
1

", ] dat = Table.read(table_in, format="ascii.html", fill_values=[("...", "0")]) assert dat.masked is False assert np.all(dat["A"].mask == [True, False]) assert dat["A"].dtype.kind == "i" @pytest.mark.skipif(not HAS_BS4, reason="requires BeautifulSoup4") def test_rename_cols(): """ Test reading a table and renaming cols """ table_in = [ "", "", "", "

A	B
1	2

", ] # Swap column names dat = Table.read(table_in, format="ascii.html", names=["B", "A"]) assert dat.colnames == ["B", "A"] assert len(dat) == 1 # Swap column names and only include A (the renamed version) dat = Table.read( table_in, format="ascii.html", names=["B", "A"], include_names=["A"] ) assert dat.colnames == ["A"] assert len(dat) == 1 assert np.all(dat["A"] == 2) @pytest.mark.skipif(not HAS_BS4, reason="requires BeautifulSoup4") def test_no_names(): """ Test reading a table with no column header """ table_in = ["", "", "", "

"] dat = Table.read(table_in, format="ascii.html") assert dat.colnames == ["col1"] assert len(dat) == 2 dat = Table.read(table_in, format="ascii.html", names=["a"]) assert dat.colnames == ["a"] assert len(dat) == 2 @pytest.mark.skipif(not HAS_BS4, reason="requires BeautifulSoup4") def test_identify_table_fail(): """ Raise an exception with an informative error message if table_id is not found. """ table_in = ['', "

A
B

"] with pytest.raises(core.InconsistentTableError) as err: Table.read( table_in, format="ascii.html", htmldict={"table_id": "bad_id"}, guess=False ) assert err.match("ERROR: HTML table id 'bad_id' not found$") with pytest.raises(core.InconsistentTableError) as err: Table.read(table_in, format="ascii.html", htmldict={"table_id": 3}, guess=False) assert err.match("ERROR: HTML table number 3 not found$") @pytest.mark.skipif(not HAS_BS4, reason="requires BeautifulSoup4") def test_backend_parsers(): """ Make sure the user can specify which back-end parser to use and that an error is raised if the parser is invalid. """ for parser in ("lxml", "xml", "html.parser", "html5lib"): try: Table.read( "data/html2.html", format="ascii.html", htmldict={"parser": parser}, guess=False, ) except FeatureNotFound: if parser == "html.parser": raise # otherwise ignore if the dependency isn't present # reading should fail if the parser is invalid with pytest.raises(FeatureNotFound): Table.read( "data/html2.html", format="ascii.html", htmldict={"parser": "foo"}, guess=False, ) @pytest.mark.skipif(HAS_BS4, reason="requires no BeautifulSoup4") def test_htmlinputter_no_bs4(): """ This should return an OptionalTableImportError if BeautifulSoup is not installed. """ inputter = html.HTMLInputter() with pytest.raises(core.OptionalTableImportError): inputter.process_lines([]) @pytest.mark.skipif(not HAS_BS4, reason="requires BeautifulSoup4") def test_htmlinputter(): """ Test to ensure that HTMLInputter correctly converts input into a list of SoupStrings representing table elements. """ f = "data/html.html" with open(f) as fd: table = fd.read() inputter = html.HTMLInputter() inputter.html = {} # In absence of table_id, defaults to the first table expected = [ "Column 1Column 2Column 3", "1a1.05", "2b2.75", "3c-1.25", ] assert [str(x) for x in inputter.get_lines(table)] == expected # Should raise an InconsistentTableError if the table is not found inputter.html = {"table_id": 4} with pytest.raises(core.InconsistentTableError): inputter.get_lines(table) # Identification by string ID inputter.html["table_id"] = "second" expected = [ "Column AColumn BColumn C", "4d10.5", "5e27.5", "6f-12.5", ] assert [str(x) for x in inputter.get_lines(table)] == expected # Identification by integer index inputter.html["table_id"] = 3 expected = [ "C1C2C3", "7g105.0", "8h275.0", "9i-125.0", ] assert [str(x) for x in inputter.get_lines(table)] == expected @pytest.mark.skipif(not HAS_BS4, reason="requires BeautifulSoup4") def test_htmlsplitter(): """ Test to make sure that HTMLSplitter correctly inputs lines of type SoupString to return a generator that gives all header and data elements. """ splitter = html.HTMLSplitter() lines = [ html.SoupString( BeautifulSoup( "

Col 1	Col 2

", "html.parser" ).tr ), html.SoupString( BeautifulSoup( "

Data 1

Data 2

", "html.parser" ).tr ), ] expected_data = [["Col 1", "Col 2"], ["Data 1", "Data 2"]] assert list(splitter(lines)) == expected_data # Make sure the presence of a non-SoupString triggers a TypeError lines.append("Data 3Data 4") with pytest.raises(TypeError): list(splitter(lines)) # Make sure that passing an empty list triggers an error with pytest.raises(core.InconsistentTableError): list(splitter([])) @pytest.mark.skipif(not HAS_BS4, reason="requires BeautifulSoup4") def test_htmlheader_start(): """ Test to ensure that the start_line method of HTMLHeader returns the first line of header data. Uses t/html.html for sample input. """ f = "data/html.html" with open(f) as fd: table = fd.read() inputter = html.HTMLInputter() inputter.html = {} header = html.HTMLHeader() lines = inputter.get_lines(table) assert ( str(lines[header.start_line(lines)]) == "Column 1Column 2Column 3" ) inputter.html["table_id"] = "second" lines = inputter.get_lines(table) assert ( str(lines[header.start_line(lines)]) == "Column AColumn BColumn C" ) inputter.html["table_id"] = 3 lines = inputter.get_lines(table) assert ( str(lines[header.start_line(lines)]) == "C1C2C3" ) # start_line should return None if no valid header is found lines = [ html.SoupString( BeautifulSoup("

Data

", "html.parser").tr ), html.SoupString(BeautifulSoup("

Text

", "html.parser").p), ] assert header.start_line(lines) is None # Should raise an error if a non-SoupString is present lines.append("Header") with pytest.raises(TypeError): header.start_line(lines) @pytest.mark.skipif(not HAS_BS4, reason="requires BeautifulSoup4") def test_htmldata(): """ Test to ensure that the start_line and end_lines methods of HTMLData returns the first line of table data. Uses t/html.html for sample input. """ f = "data/html.html" with open(f) as fd: table = fd.read() inputter = html.HTMLInputter() inputter.html = {} data = html.HTMLData() lines = inputter.get_lines(table) assert ( str(lines[data.start_line(lines)]) == "1a1.05" ) # end_line returns the index of the last data element + 1 assert ( str(lines[data.end_line(lines) - 1]) == "3c-1.25" ) inputter.html["table_id"] = "second" lines = inputter.get_lines(table) assert ( str(lines[data.start_line(lines)]) == "4d10.5" ) assert ( str(lines[data.end_line(lines) - 1]) == "6f-12.5" ) inputter.html["table_id"] = 3 lines = inputter.get_lines(table) assert ( str(lines[data.start_line(lines)]) == "7g105.0" ) assert ( str(lines[data.end_line(lines) - 1]) == "9i-125.0" ) # start_line should raise an error if no table data exists lines = [ html.SoupString(BeautifulSoup("

", "html.parser").div), html.SoupString(BeautifulSoup("

Text

", "html.parser").p), ] with pytest.raises(core.InconsistentTableError): data.start_line(lines) # end_line should return None if no table data exists assert data.end_line(lines) is None # Should raise an error if a non-SoupString is present lines.append("Data") with pytest.raises(TypeError): data.start_line(lines) with pytest.raises(TypeError): data.end_line(lines) def test_multicolumn_write(): """ Test to make sure that the HTML writer writes multidimensional columns (those with iterable elements) using the colspan attribute of . """ col1 = [1, 2, 3] col2 = [(1.0, 1.0), (2.0, 2.0), (3.0, 3.0)] col3 = [("a", "a", "a"), ("b", "b", "b"), ("c", "c", "c")] table = Table([col1, col2, col3], names=("C1", "C2", "C3")) expected = """\

C1	C2		C3
1	1.0	1.0	a	a	a
2	2.0	2.0	b	b	b
3	3.0	3.0	c	c	c

""" out = html.HTML().write(table)[0].strip() assert out == expected.strip() @pytest.mark.skipif(not HAS_BLEACH, reason="requires bleach") def test_multicolumn_write_escape(): """ Test to make sure that the HTML writer writes multidimensional columns (those with iterable elements) using the colspan attribute of . """ col1 = [1, 2, 3] col2 = [(1.0, 1.0), (2.0, 2.0), (3.0, 3.0)] col3 = [("", "", "a"), ("", "b", "b"), ("c", "c", "c")] table = Table([col1, col2, col3], names=("C1", "C2", "C3")) expected = """\

C1	C2		C3
1	1.0	1.0			a
2	2.0	2.0		b	b
3	3.0	3.0	c	c	c

""" out = html.HTML(htmldict={"raw_html_cols": "C3"}).write(table)[0].strip() assert out == expected.strip() def test_write_no_multicols(): """ Test to make sure that the HTML writer will not use multi-dimensional columns if the multicol parameter is False. """ col1 = [1, 2, 3] col2 = [(1.0, 1.0), (2.0, 2.0), (3.0, 3.0)] col3 = [("a", "a", "a"), ("b", "b", "b"), ("c", "c", "c")] table = Table([col1, col2, col3], names=("C1", "C2", "C3")) expected = """\

C1	C2	C3
1	1.0 .. 1.0	a .. a
2	2.0 .. 2.0	b .. b
3	3.0 .. 3.0	c .. c

""" assert html.HTML({"multicol": False}).write(table)[0].strip() == expected.strip() @pytest.mark.skipif(not HAS_BS4, reason="requires BeautifulSoup4") def test_multicolumn_read(): """ Test to make sure that the HTML reader inputs multidimensional columns (those with iterable elements) using the colspan attribute of . Ensure that any string element within a multidimensional column casts all elements to string prior to type conversion operations. """ table = Table.read("data/html2.html", format="ascii.html") str_type = np.dtype((str, 21)) expected = Table( np.array( [(["1", "2.5000000000000000001"], 3), (["1a", "1"], 3.5)], dtype=[("A", str_type, (2,)), ("B", "x"], ["y"]], names=["a", "b"]) # One column contains raw HTML (string input) out = StringIO() t.write(out, format="ascii.html", htmldict={"raw_html_cols": "a"}) expected = """\ x <em>y</em> """ assert expected in out.getvalue() # One column contains raw HTML (list input) out = StringIO() t.write(out, format="ascii.html", htmldict={"raw_html_cols": ["a"]}) assert expected in out.getvalue() # Two columns contains raw HTML (list input) out = StringIO() t.write(out, format="ascii.html", htmldict={"raw_html_cols": ["a", "b"]}) expected = """\ x y """ assert expected in out.getvalue() @pytest.mark.skipif(not HAS_BLEACH, reason="requires bleach") def test_raw_html_write_clean(): """ Test that columns can contain raw HTML which is not escaped. """ import bleach t = Table( [[""], ["

"], ["y"]], names=["a", "b", "c"] ) # Confirm that