# Licensed under a 3-clause BSD style license - see LICENSE.rst
"""
This module tests some of the methods related to the ``HTML``
reader/writer and aims to document its functionality.
Requires `BeautifulSoup <https://www.crummy.com/software/BeautifulSoup/>`_
to be installed.
"""
from io import StringIO
import numpy as np
import pytest
from astropy.io import ascii
from astropy.io.ascii import core, html
from astropy.table import Table
from astropy.utils.compat.optional_deps import HAS_BLEACH, HAS_BS4
from .common import setup_function, teardown_function # noqa: F401
if HAS_BS4:
from bs4 import BeautifulSoup, FeatureNotFound
@pytest.mark.skipif(not HAS_BS4, reason="requires BeautifulSoup4")
def test_soupstring():
    """
    Test to make sure the class SoupString behaves properly.

    A ``SoupString`` must be a genuine ``str`` that compares equal to the
    serialized markup and keeps the originating soup object in ``.soup``.
    """
    # NOTE(review): the markup had been stripped from this literal; it is
    # restored as a minimal document wrapping the surviving text "foo".
    markup = "<html><head></head><body><p>foo</p></body></html>"
    soup = BeautifulSoup(markup, "html.parser")
    soup_str = html.SoupString(soup)

    assert isinstance(soup_str, str)
    assert isinstance(soup_str, html.SoupString)
    assert soup_str == markup
    assert soup_str.soup is soup
def test_listwriter():
    """
    Check that ListWriter forwards every written item, in order, to the
    list it wraps.
    """
    collected = []
    list_writer = html.ListWriter(collected)

    # Write the integers 0..4 followed by the characters 'a'..'e'.
    for item in [*range(5), *"abcde"]:
        list_writer.write(item)

    assert collected == [0, 1, 2, 3, 4, "a", "b", "c", "d", "e"]
@pytest.mark.skipif(not HAS_BS4, reason="requires BeautifulSoup4")
def test_identify_table():
    """
    Test to make sure that identify_table() returns whether the
    given BeautifulSoup tag is the correct table to process.

    ``identify_table(tag, htmldict, numtable)`` is exercised with the
    default ``table_id``, an explicit integer ``table_id`` and a string
    ``table_id``.
    """
    # Should return False on non-<table> tags and None
    soup = BeautifulSoup("<html><body></body></html>", "html.parser")
    assert html.identify_table(soup, {}, 0) is False
    assert html.identify_table(None, {}, 0) is False

    # A real <table> tag carrying id="foo"; the id is what the string
    # look-ups further down match against.
    soup = BeautifulSoup(
        '<table id="foo"><tr><th>A</th></tr><tr><td>B</td></tr></table>',
        "html.parser",
    ).table
    assert html.identify_table(soup, {}, 2) is False
    assert html.identify_table(soup, {}, 1) is True  # Default index of 1

    # Same tests, but with explicit parameter
    assert html.identify_table(soup, {"table_id": 2}, 1) is False
    assert html.identify_table(soup, {"table_id": 1}, 1) is True

    # Test identification by string ID
    assert html.identify_table(soup, {"table_id": "bar"}, 1) is False
    assert html.identify_table(soup, {"table_id": "foo"}, 1) is True
@pytest.mark.skipif(not HAS_BS4, reason="requires BeautifulSoup4")
def test_missing_data():
    """
    Test reading a table with missing data

    Column ``A`` has two rows: the first is missing, the second is an
    integer, so the column must come back masked ``[True, False]`` with an
    integer dtype.
    """
    # First with default where blank => '0'
    table_in = [
        "<table>",
        "<tr><th>A</th></tr>",
        "<tr><td></td></tr>",
        "<tr><td>1</td></tr>",
        "</table>",
    ]
    dat = Table.read(table_in, format="ascii.html")
    assert dat.masked is False
    assert np.all(dat["A"].mask == [True, False])
    assert dat["A"].dtype.kind == "i"

    # Now with a specific value '...' => missing
    table_in = [
        "<table>",
        "<tr><th>A</th></tr>",
        "<tr><td>...</td></tr>",
        "<tr><td>1</td></tr>",
        "</table>",
    ]
    dat = Table.read(table_in, format="ascii.html", fill_values=[("...", "0")])
    assert dat.masked is False
    assert np.all(dat["A"].mask == [True, False])
    assert dat["A"].dtype.kind == "i"
@pytest.mark.skipif(not HAS_BS4, reason="requires BeautifulSoup4")
def test_rename_cols():
    """
    Test reading a table and renaming cols

    The input has header ``A, B`` and a single row ``1, 2``; ``names``
    relabels the columns positionally, so after the swap column ``A``
    holds the value 2.
    """
    table_in = [
        "<table>",
        "<tr><th>A</th> <th>B</th></tr>",
        "<tr><td>1</td><td>2</td></tr>",
        "</table>",
    ]

    # Swap column names
    dat = Table.read(table_in, format="ascii.html", names=["B", "A"])
    assert dat.colnames == ["B", "A"]
    assert len(dat) == 1

    # Swap column names and only include A (the renamed version)
    dat = Table.read(
        table_in, format="ascii.html", names=["B", "A"], include_names=["A"]
    )
    assert dat.colnames == ["A"]
    assert len(dat) == 1
    assert np.all(dat["A"] == 2)
@pytest.mark.skipif(not HAS_BS4, reason="requires BeautifulSoup4")
def test_no_names():
    """
    Test reading a table with no column header

    With no ``<th>`` row the reader must fall back to the auto-generated
    name ``col1`` unless ``names`` is supplied.
    """
    # Two data rows, one column, no header row.
    table_in = ["<table>", "<tr><td>1</td></tr>", "<tr><td>2</td></tr>", "</table>"]

    dat = Table.read(table_in, format="ascii.html")
    assert dat.colnames == ["col1"]
    assert len(dat) == 2

    dat = Table.read(table_in, format="ascii.html", names=["a"])
    assert dat.colnames == ["a"]
    assert len(dat) == 2
@pytest.mark.skipif(not HAS_BS4, reason="requires BeautifulSoup4")
def test_identify_table_fail():
    """
    Raise an exception with an informative error message if table_id
    is not found.

    Both the string form and the integer form of ``table_id`` are checked.
    """
    # A single table whose id ("foo") matches neither look-up below.
    table_in = ['<table id="foo"><tr><th>A</th></tr>', "<tr><td>B</td></tr></table>"]

    with pytest.raises(core.InconsistentTableError) as err:
        Table.read(
            table_in, format="ascii.html", htmldict={"table_id": "bad_id"}, guess=False
        )
    # The message check must sit outside the ``with`` block: statements
    # after the raising call inside it would never execute.
    assert err.match("ERROR: HTML table id 'bad_id' not found$")

    with pytest.raises(core.InconsistentTableError) as err:
        Table.read(table_in, format="ascii.html", htmldict={"table_id": 3}, guess=False)
    assert err.match("ERROR: HTML table number 3 not found$")
@pytest.mark.skipif(not HAS_BS4, reason="requires BeautifulSoup4")
def test_backend_parsers():
    """
    Check that the back-end parser can be chosen through ``htmldict`` and
    that an unknown parser name makes the read fail.
    """

    def read_with(parser_name):
        # Read the sample file with the requested BeautifulSoup back end.
        return Table.read(
            "data/html2.html",
            format="ascii.html",
            htmldict={"parser": parser_name},
            guess=False,
        )

    for candidate in ("lxml", "xml", "html.parser", "html5lib"):
        try:
            read_with(candidate)
        except FeatureNotFound:
            # Optional back ends may simply not be installed; only the
            # built-in "html.parser" is required to work.
            if candidate != "html.parser":
                continue
            raise

    # An invalid parser name must be rejected.
    with pytest.raises(FeatureNotFound):
        read_with("foo")
@pytest.mark.skipif(HAS_BS4, reason="requires no BeautifulSoup4")
def test_htmlinputter_no_bs4():
    """
    Without BeautifulSoup installed, processing lines through the HTML
    inputter must raise OptionalTableImportError.
    """
    no_lines = []
    html_inputter = html.HTMLInputter()

    with pytest.raises(core.OptionalTableImportError):
        html_inputter.process_lines(no_lines)
@pytest.mark.skipif(not HAS_BS4, reason="requires BeautifulSoup4")
def test_htmlinputter():
    """
    Test to ensure that HTMLInputter correctly converts input
    into a list of SoupStrings representing table elements.

    Reads ``data/html.html`` and selects a table by default, by string
    id and by integer index.
    """
    # NOTE(review): the <tr>/<th>/<td> markup in the expected rows was
    # reconstructed from the surviving cell text; confirm the exact
    # serialization against data/html.html.
    f = "data/html.html"
    with open(f) as fd:
        table = fd.read()

    inputter = html.HTMLInputter()
    inputter.html = {}

    # In absence of table_id, defaults to the first table
    expected = [
        "<tr><th>Column 1</th><th>Column 2</th><th>Column 3</th></tr>",
        "<tr><td>1</td><td>a</td><td>1.05</td></tr>",
        "<tr><td>2</td><td>b</td><td>2.75</td></tr>",
        "<tr><td>3</td><td>c</td><td>-1.25</td></tr>",
    ]
    assert [str(x) for x in inputter.get_lines(table)] == expected

    # Should raise an InconsistentTableError if the table is not found
    inputter.html = {"table_id": 4}
    with pytest.raises(core.InconsistentTableError):
        inputter.get_lines(table)

    # Identification by string ID
    inputter.html["table_id"] = "second"
    expected = [
        "<tr><th>Column A</th><th>Column B</th><th>Column C</th></tr>",
        "<tr><td>4</td><td>d</td><td>10.5</td></tr>",
        "<tr><td>5</td><td>e</td><td>27.5</td></tr>",
        "<tr><td>6</td><td>f</td><td>-12.5</td></tr>",
    ]
    assert [str(x) for x in inputter.get_lines(table)] == expected

    # Identification by integer index
    inputter.html["table_id"] = 3
    expected = [
        "<tr><th>C1</th><th>C2</th><th>C3</th></tr>",
        "<tr><td>7</td><td>g</td><td>105.0</td></tr>",
        "<tr><td>8</td><td>h</td><td>275.0</td></tr>",
        "<tr><td>9</td><td>i</td><td>-125.0</td></tr>",
    ]
    assert [str(x) for x in inputter.get_lines(table)] == expected
@pytest.mark.skipif(not HAS_BS4, reason="requires BeautifulSoup4")
def test_htmlsplitter():
    """
    Test to make sure that HTMLSplitter correctly inputs lines
    of type SoupString to return a generator that gives all
    header and data elements.
    """
    splitter = html.HTMLSplitter()

    # One header row and one data row, each wrapped as a SoupString around
    # its <tr> tag. The cell text matches ``expected_data`` below.
    lines = [
        html.SoupString(
            BeautifulSoup(
                "<table><tr><th>Col 1</th><th>Col 2</th></tr></table>", "html.parser"
            ).tr
        ),
        html.SoupString(
            BeautifulSoup(
                "<table><tr><td>Data 1</td><td>Data 2</td></tr></table>", "html.parser"
            ).tr
        ),
    ]
    expected_data = [["Col 1", "Col 2"], ["Data 1", "Data 2"]]
    assert list(splitter(lines)) == expected_data

    # Make sure the presence of a non-SoupString triggers a TypeError
    lines.append("<tr><td>Data 3</td><td>Data 4</td></tr>")
    with pytest.raises(TypeError):
        list(splitter(lines))

    # Make sure that passing an empty list triggers an error
    with pytest.raises(core.InconsistentTableError):
        list(splitter([]))
@pytest.mark.skipif(not HAS_BS4, reason="requires BeautifulSoup4")
def test_htmlheader_start():
    """
    Test to ensure that the start_line method of HTMLHeader
    returns the first line of header data. Uses data/html.html
    for sample input.
    """
    # NOTE(review): the <tr>/<th> markup in the expected header rows was
    # reconstructed from the surviving cell text; confirm against
    # data/html.html.
    f = "data/html.html"
    with open(f) as fd:
        table = fd.read()

    inputter = html.HTMLInputter()
    inputter.html = {}
    header = html.HTMLHeader()

    lines = inputter.get_lines(table)
    assert (
        str(lines[header.start_line(lines)])
        == "<tr><th>Column 1</th><th>Column 2</th><th>Column 3</th></tr>"
    )

    inputter.html["table_id"] = "second"
    lines = inputter.get_lines(table)
    assert (
        str(lines[header.start_line(lines)])
        == "<tr><th>Column A</th><th>Column B</th><th>Column C</th></tr>"
    )

    inputter.html["table_id"] = 3
    lines = inputter.get_lines(table)
    assert (
        str(lines[header.start_line(lines)])
        == "<tr><th>C1</th><th>C2</th><th>C3</th></tr>"
    )

    # start_line should return None if no valid header is found
    lines = [
        html.SoupString(
            BeautifulSoup("<table><tr><td>Data</td></tr></table>", "html.parser").tr
        ),
        html.SoupString(BeautifulSoup("<p>Text</p>", "html.parser").p),
    ]
    assert header.start_line(lines) is None

    # Should raise an error if a non-SoupString is present
    lines.append("<tr><th>Header</th></tr>")
    with pytest.raises(TypeError):
        header.start_line(lines)
@pytest.mark.skipif(not HAS_BS4, reason="requires BeautifulSoup4")
def test_htmldata():
    """
    Test to ensure that the start_line and end_line methods
    of HTMLData return the first line of table data and the index
    just past the last one. Uses data/html.html for sample input.
    """
    # NOTE(review): the <tr>/<td> markup in the expected data rows was
    # reconstructed from the surviving cell text; confirm against
    # data/html.html.
    f = "data/html.html"
    with open(f) as fd:
        table = fd.read()

    inputter = html.HTMLInputter()
    inputter.html = {}
    data = html.HTMLData()

    lines = inputter.get_lines(table)
    assert (
        str(lines[data.start_line(lines)])
        == "<tr><td>1</td><td>a</td><td>1.05</td></tr>"
    )
    # end_line returns the index of the last data element + 1
    assert (
        str(lines[data.end_line(lines) - 1])
        == "<tr><td>3</td><td>c</td><td>-1.25</td></tr>"
    )

    inputter.html["table_id"] = "second"
    lines = inputter.get_lines(table)
    assert (
        str(lines[data.start_line(lines)])
        == "<tr><td>4</td><td>d</td><td>10.5</td></tr>"
    )
    assert (
        str(lines[data.end_line(lines) - 1])
        == "<tr><td>6</td><td>f</td><td>-12.5</td></tr>"
    )

    inputter.html["table_id"] = 3
    lines = inputter.get_lines(table)
    assert (
        str(lines[data.start_line(lines)])
        == "<tr><td>7</td><td>g</td><td>105.0</td></tr>"
    )
    assert (
        str(lines[data.end_line(lines) - 1])
        == "<tr><td>9</td><td>i</td><td>-125.0</td></tr>"
    )

    # start_line should raise an error if no table data exists
    lines = [
        html.SoupString(BeautifulSoup("<div></div>", "html.parser").div),
        html.SoupString(BeautifulSoup("<p>Text</p>", "html.parser").p),
    ]
    with pytest.raises(core.InconsistentTableError):
        data.start_line(lines)

    # end_line should return None if no table data exists
    assert data.end_line(lines) is None

    # Should raise an error if a non-SoupString is present
    lines.append("<tr><td>Data</td></tr>")
    with pytest.raises(TypeError):
        data.start_line(lines)
    with pytest.raises(TypeError):
        data.end_line(lines)
def test_multicolumn_write():
    """
    Test to make sure that the HTML writer writes multidimensional
    columns (those with iterable elements) using the colspan
    attribute of <th>.
    """
    col1 = [1, 2, 3]
    col2 = [(1.0, 1.0), (2.0, 2.0), (3.0, 3.0)]
    col3 = [("a", "a", "a"), ("b", "b", "b"), ("c", "c", "c")]
    table = Table([col1, col2, col3], names=("C1", "C2", "C3"))

    # NOTE(review): the tags of this expected document were reconstructed
    # from the surviving cell text; confirm the exact layout (indentation,
    # <meta> lines) against the writer's actual output.
    expected = """\
<html>
 <head>
  <meta charset="utf-8"/>
  <meta content="text/html;charset=UTF-8" http-equiv="Content-type"/>
 </head>
 <body>
  <table>
   <thead>
    <tr>
     <th>C1</th>
     <th colspan="2">C2</th>
     <th colspan="3">C3</th>
    </tr>
   </thead>
   <tr>
    <td>1</td>
    <td>1.0</td>
    <td>1.0</td>
    <td>a</td>
    <td>a</td>
    <td>a</td>
   </tr>
   <tr>
    <td>2</td>
    <td>2.0</td>
    <td>2.0</td>
    <td>b</td>
    <td>b</td>
    <td>b</td>
   </tr>
   <tr>
    <td>3</td>
    <td>3.0</td>
    <td>3.0</td>
    <td>c</td>
    <td>c</td>
    <td>c</td>
   </tr>
  </table>
 </body>
</html>
    """
    # Both sides are stripped so surrounding whitespace is irrelevant.
    out = html.HTML().write(table)[0].strip()
    assert out == expected.strip()
@pytest.mark.skipif(not HAS_BLEACH, reason="requires bleach")
def test_multicolumn_write_escape():
    """
    Test to make sure that the HTML writer writes multidimensional
    columns (those with iterable elements) using the colspan
    attribute of <th>, while leaving the markup of a column listed in
    ``raw_html_cols`` unescaped.
    """
    # NOTE(review): the raw-HTML cell values in ``col3`` and the tags of
    # the expected document were reconstructed; only the plain cell text
    # survived. Confirm against the writer's actual output.
    col1 = [1, 2, 3]
    col2 = [(1.0, 1.0), (2.0, 2.0), (3.0, 3.0)]
    col3 = [("<a></a>", "<a></a>", "a"), ("<b></b>", "b", "b"), ("c", "c", "c")]
    table = Table([col1, col2, col3], names=("C1", "C2", "C3"))
    expected = """\
<html>
 <head>
  <meta charset="utf-8"/>
  <meta content="text/html;charset=UTF-8" http-equiv="Content-type"/>
 </head>
 <body>
  <table>
   <thead>
    <tr>
     <th>C1</th>
     <th colspan="2">C2</th>
     <th colspan="3">C3</th>
    </tr>
   </thead>
   <tr>
    <td>1</td>
    <td>1.0</td>
    <td>1.0</td>
    <td><a></a></td>
    <td><a></a></td>
    <td>a</td>
   </tr>
   <tr>
    <td>2</td>
    <td>2.0</td>
    <td>2.0</td>
    <td><b></b></td>
    <td>b</td>
    <td>b</td>
   </tr>
   <tr>
    <td>3</td>
    <td>3.0</td>
    <td>3.0</td>
    <td>c</td>
    <td>c</td>
    <td>c</td>
   </tr>
  </table>
 </body>
</html>
    """
    out = html.HTML(htmldict={"raw_html_cols": "C3"}).write(table)[0].strip()
    assert out == expected.strip()
def test_write_no_multicols():
    """
    Test to make sure that the HTML writer will not use
    multi-dimensional columns if the multicol parameter
    is False.

    Each multidimensional value is then written into a single cell
    (e.g. ``1.0 .. 1.0``) instead of being spread over several cells.
    """
    col1 = [1, 2, 3]
    col2 = [(1.0, 1.0), (2.0, 2.0), (3.0, 3.0)]
    col3 = [("a", "a", "a"), ("b", "b", "b"), ("c", "c", "c")]
    table = Table([col1, col2, col3], names=("C1", "C2", "C3"))

    # NOTE(review): the tags of this expected document were reconstructed
    # from the surviving cell text; confirm against the writer's output.
    expected = """\
<html>
 <head>
  <meta charset="utf-8"/>
  <meta content="text/html;charset=UTF-8" http-equiv="Content-type"/>
 </head>
 <body>
  <table>
   <thead>
    <tr>
     <th>C1</th>
     <th>C2</th>
     <th>C3</th>
    </tr>
   </thead>
   <tr>
    <td>1</td>
    <td>1.0 .. 1.0</td>
    <td>a .. a</td>
   </tr>
   <tr>
    <td>2</td>
    <td>2.0 .. 2.0</td>
    <td>b .. b</td>
   </tr>
   <tr>
    <td>3</td>
    <td>3.0 .. 3.0</td>
    <td>c .. c</td>
   </tr>
  </table>
 </body>
</html>
    """
    assert html.HTML({"multicol": False}).write(table)[0].strip() == expected.strip()
@pytest.mark.skipif(not HAS_BS4, reason="requires BeautifulSoup4")
def test_multicolumn_read():
    """
    Test to make sure that the HTML reader inputs multidimensional
    columns (those with iterable elements) using the colspan
    attribute of <th>.

    Ensure that any string element within a multidimensional column
    casts all elements to string prior to type conversion operations.
    """
    # NOTE(review): everything from the "<f8" dtype string to the start of
    # the next test's table literal had been swallowed when markup was
    # stripped. The tail of this test was reconstructed; confirm it.
    table = Table.read("data/html2.html", format="ascii.html")
    str_type = np.dtype((str, 21))
    expected = Table(
        np.array(
            [(["1", "2.5000000000000000001"], 3), (["1a", "1"], 3.5)],
            dtype=[("A", str_type, (2,)), ("B", "<f8")],
        )
    )
    assert np.all(table == expected)


# NOTE(review): the decorator, name and docstring of this test were lost
# with the stripped markup and are reconstructed by analogy with the
# bleach-gated raw-HTML tests in this module; confirm against upstream.
@pytest.mark.skipif(not HAS_BLEACH, reason="requires bleach")
def test_raw_html_write():
    """
    Test that columns can contain raw HTML which is not escaped.
    """
    t = Table([["<em>x</em>"], ["<em>y</em>"]], names=["a", "b"])

    # One column contains raw HTML (string input)
    out = StringIO()
    t.write(out, format="ascii.html", htmldict={"raw_html_cols": "a"})
    # Column "a" keeps its markup; column "b" is entity-escaped.
    expected = """\
   <tr>
    <td><em>x</em></td>
    <td>&lt;em&gt;y&lt;/em&gt;</td>
   </tr>"""
    assert expected in out.getvalue()

    # One column contains raw HTML (list input)
    out = StringIO()
    t.write(out, format="ascii.html", htmldict={"raw_html_cols": ["a"]})
    assert expected in out.getvalue()

    # Two columns contains raw HTML (list input)
    out = StringIO()
    t.write(out, format="ascii.html", htmldict={"raw_html_cols": ["a", "b"]})
    expected = """\
   <tr>
    <td><em>x</em></td>
    <td><em>y</em></td>
   </tr>"""
    assert expected in out.getvalue()
@pytest.mark.skipif(not HAS_BLEACH, reason="requires bleach")
def test_raw_html_write_clean():
"""
Test that columns can contain raw HTML which is not escaped.
"""
import bleach
t = Table(
[[""], ["y "], ["y"]], names=["a", "b", "c"]
)
# Confirm that |