# Licensed under a 3-clause BSD style license - see LICENSE.rst
"""
This module tests some of the methods related to the ``HTML``
reader/writer and aims to document its functionality.
Requires `BeautifulSoup <https://www.crummy.com/software/BeautifulSoup/>`_
to be installed.
"""
import os
from io import StringIO
from pathlib import Path
import numpy as np
import pytest
from astropy.io import ascii
from astropy.io.ascii import core, html
from astropy.table import Table
from astropy.utils.compat.optional_deps import HAS_BLEACH, HAS_BS4
from .common import setup_function, teardown_function # noqa: F401
if HAS_BS4:
    # bs4 is an optional dependency: only import from it when it is present.
    # Tests that need these names are skipped via ``HAS_BS4`` otherwise.
    from bs4 import BeautifulSoup, FeatureNotFound
@pytest.mark.skipif(not HAS_BS4, reason="requires BeautifulSoup4")
def test_soupstring():
    """
    Test to make sure the class SoupString behaves properly.

    A ``SoupString`` is expected to be a genuine ``str`` that compares equal
    to the markup it was built from, while keeping a reference to the
    originating soup object in its ``soup`` attribute.
    """
    # NOTE(review): the HTML markup had been stripped from the string literals
    # in this test (leaving unterminated strings); it has been restored here.
    # Confirm the exact markup against the upstream test if it matters.
    markup = "<html><head></head><body><p>foo</p></body></html>"
    soup = BeautifulSoup(markup, "html.parser")
    soup_str = html.SoupString(soup)
    assert isinstance(soup_str, str)
    assert isinstance(soup_str, html.SoupString)
    assert soup_str == markup
    assert soup_str.soup is soup
def test_listwriter():
    """
    Check that ``ListWriter.write`` appends every written object, in order,
    to the list the writer was constructed with.
    """
    collected = []
    writer = html.ListWriter(collected)

    # Integers first, then single characters -- same order as the expectation.
    for item in [*range(5), *"abcde"]:
        writer.write(item)

    assert collected == [0, 1, 2, 3, 4, "a", "b", "c", "d", "e"]
@pytest.mark.skipif(not HAS_BS4, reason="requires BeautifulSoup4")
def test_identify_table():
    """
    Test to make sure that identify_table() returns whether the
    given BeautifulSoup tag is the correct table to process.

    ``identify_table(soup, htmldict, numtable)`` is called with the candidate
    tag, the reader's ``htmldict`` and the 1-based position of the table.
    """
    # NOTE(review): the markup literals below were empty in the damaged source
    # and have been restored so that ``.table`` yields a tag with id "foo".

    # Should return False on non-<table> tags and None
    soup = BeautifulSoup("<html><body></body></html>", "html.parser")
    assert html.identify_table(soup, {}, 0) is False
    assert html.identify_table(None, {}, 0) is False

    soup = BeautifulSoup(
        '<table id="foo"><tr><th>A</th></tr><tr><td>B</td></tr></table>',
        "html.parser",
    ).table
    assert html.identify_table(soup, {}, 2) is False
    assert html.identify_table(soup, {}, 1) is True  # Default index of 1

    # Same tests, but with explicit parameter
    assert html.identify_table(soup, {"table_id": 2}, 1) is False
    assert html.identify_table(soup, {"table_id": 1}, 1) is True

    # Test identification by string ID
    assert html.identify_table(soup, {"table_id": "bar"}, 1) is False
    assert html.identify_table(soup, {"table_id": "foo"}, 1) is True
@pytest.mark.skipif(not HAS_BS4, reason="requires BeautifulSoup4")
def test_missing_data():
    """
    Test reading a table with missing data.

    The first row of column ``A`` is the missing marker and the second row is
    an integer, so the column must come back as an integer column whose mask
    is ``[True, False]``.
    """
    # NOTE(review): the table markup was stripped from the literals in the
    # damaged source; the rows below are rebuilt from what the assertions
    # require (one column "A", a missing value followed by an integer).

    # First with default where blank => '0'
    table_in = [
        "<table>",
        "<tr><th>A</th></tr>",
        "<tr><td></td></tr>",
        "<tr><td>1</td></tr>",
        "</table>",
    ]
    dat = Table.read(table_in, format="ascii.html")
    assert dat.masked is False
    assert np.all(dat["A"].mask == [True, False])
    assert dat["A"].dtype.kind == "i"

    # Now with a specific value '...' => missing
    table_in = [
        "<table>",
        "<tr><th>A</th></tr>",
        "<tr><td>...</td></tr>",
        "<tr><td>1</td></tr>",
        "</table>",
    ]
    dat = Table.read(table_in, format="ascii.html", fill_values=[("...", "0")])
    assert dat.masked is False
    assert np.all(dat["A"].mask == [True, False])
    assert dat["A"].dtype.kind == "i"
@pytest.mark.skipif(not HAS_BS4, reason="requires BeautifulSoup4")
def test_rename_cols():
    """
    Test reading a table and renaming cols.

    The input has header ``A, B`` and a single row ``1, 2``; passing
    ``names=["B", "A"]`` swaps the names, so the column finally called ``A``
    holds the value 2.
    """
    # NOTE(review): the table markup was stripped from the literal in the
    # damaged source; it is rebuilt from what the assertions require.
    table_in = [
        "<table>",
        "<tr><th>A</th> <th>B</th></tr>",
        "<tr><td>1</td><td>2</td></tr>",
        "</table>",
    ]

    # Swap column names
    dat = Table.read(table_in, format="ascii.html", names=["B", "A"])
    assert dat.colnames == ["B", "A"]
    assert len(dat) == 1

    # Swap column names and only include A (the renamed version)
    dat = Table.read(
        table_in, format="ascii.html", names=["B", "A"], include_names=["A"]
    )
    assert dat.colnames == ["A"]
    assert len(dat) == 1
    assert np.all(dat["A"] == 2)
@pytest.mark.skipif(not HAS_BS4, reason="requires BeautifulSoup4")
def test_no_names():
    """
    Test reading a table with no column header.

    Without a header row the single column is auto-named ``col1`` unless
    ``names`` is supplied, and both data rows are kept.
    """
    # NOTE(review): the table markup was stripped from the literal in the
    # damaged source; it is rebuilt as two header-less data rows, which is
    # what the assertions require.
    table_in = ["<table>", "<tr><td>1</td></tr>", "<tr><td>2</td></tr>", "</table>"]

    dat = Table.read(table_in, format="ascii.html")
    assert dat.colnames == ["col1"]
    assert len(dat) == 2

    dat = Table.read(table_in, format="ascii.html", names=["a"])
    assert dat.colnames == ["a"]
    assert len(dat) == 2
@pytest.mark.skipif(not HAS_BS4, reason="requires BeautifulSoup4")
def test_identify_table_fail():
    """
    Raise an exception with an informative error message if table_id
    is not found.

    Both lookup styles are covered: a string id that matches no table and an
    integer index beyond the number of tables in the input.
    """
    # NOTE(review): this literal was reduced to mismatched quotes in the
    # damaged source; restored as a single table whose id is not "bad_id".
    table_in = ['<table id="foo"><tr><th>A</th></tr>', "<tr><td>B</td></tr></table>"]

    with pytest.raises(core.InconsistentTableError) as err:
        Table.read(
            table_in, format="ascii.html", htmldict={"table_id": "bad_id"}, guess=False
        )
    # Checked after the ``with`` block: code following the raising call inside
    # the block would never execute.
    assert err.match("ERROR: HTML table id 'bad_id' not found$")

    with pytest.raises(core.InconsistentTableError) as err:
        Table.read(table_in, format="ascii.html", htmldict={"table_id": 3}, guess=False)
    assert err.match("ERROR: HTML table number 3 not found$")
@pytest.mark.skipif(not HAS_BS4, reason="requires BeautifulSoup4")
def test_backend_parsers():
    """
    Check that ``htmldict["parser"]`` selects the BeautifulSoup back end and
    that an unknown parser name is rejected with ``FeatureNotFound``.
    """
    common = {"format": "ascii.html", "guess": False}

    for backend in ("lxml", "xml", "html.parser", "html5lib"):
        try:
            Table.read("data/html2.html", htmldict={"parser": backend}, **common)
        except FeatureNotFound:
            # A failure is only re-raised for "html.parser"; for the other
            # back ends a missing dependency is silently tolerated.
            if backend == "html.parser":
                raise

    # An invalid parser name must make the read fail.
    with pytest.raises(FeatureNotFound):
        Table.read("data/html2.html", htmldict={"parser": "foo"}, **common)
@pytest.mark.skipif(HAS_BS4, reason="requires no BeautifulSoup4")
def test_htmlinputter_no_bs4():
    """
    Without BeautifulSoup installed, ``HTMLInputter.process_lines`` must
    raise ``OptionalTableImportError``.
    """
    reader_input = html.HTMLInputter()
    no_lines = []
    with pytest.raises(core.OptionalTableImportError):
        reader_input.process_lines(no_lines)
@pytest.mark.skipif(not HAS_BS4, reason="requires BeautifulSoup4")
def test_htmlinputter():
    """
    Test to ensure that HTMLInputter correctly converts input
    into a list of SoupStrings representing table elements.

    Reads ``data/html.html`` and selects a table by default position, by
    string id and by integer index via ``inputter.html["table_id"]``.
    """
    # NOTE(review): the expected row strings had their tags stripped in the
    # damaged source (rows survived only as "| a | b |"). They are restored
    # as <tr> rows with <th> header cells and <td> data cells -- verify
    # against the fixture file data/html.html.
    f = "data/html.html"
    with open(f) as fd:
        table = fd.read()

    inputter = html.HTMLInputter()
    inputter.html = {}

    # In absence of table_id, defaults to the first table
    expected = [
        "<tr><th>Column 1</th><th>Column 2</th><th>Column 3</th></tr>",
        "<tr><td>1</td><td>a</td><td>1.05</td></tr>",
        "<tr><td>2</td><td>b</td><td>2.75</td></tr>",
        "<tr><td>3</td><td>c</td><td>-1.25</td></tr>",
    ]
    assert [str(x) for x in inputter.get_lines(table)] == expected

    # Should raise an InconsistentTableError if the table is not found
    inputter.html = {"table_id": 4}
    with pytest.raises(core.InconsistentTableError):
        inputter.get_lines(table)

    # Identification by string ID
    inputter.html["table_id"] = "second"
    expected = [
        "<tr><th>Column A</th><th>Column B</th><th>Column C</th></tr>",
        "<tr><td>4</td><td>d</td><td>10.5</td></tr>",
        "<tr><td>5</td><td>e</td><td>27.5</td></tr>",
        "<tr><td>6</td><td>f</td><td>-12.5</td></tr>",
    ]
    assert [str(x) for x in inputter.get_lines(table)] == expected

    # Identification by integer index
    inputter.html["table_id"] = 3
    expected = [
        "<tr><th>C1</th><th>C2</th><th>C3</th></tr>",
        "<tr><td>7</td><td>g</td><td>105.0</td></tr>",
        "<tr><td>8</td><td>h</td><td>275.0</td></tr>",
        "<tr><td>9</td><td>i</td><td>-125.0</td></tr>",
    ]
    assert [str(x) for x in inputter.get_lines(table)] == expected
@pytest.mark.skipif(not HAS_BS4, reason="requires BeautifulSoup4")
def test_htmlsplitter():
    """
    Test to make sure that HTMLSplitter correctly inputs lines
    of type SoupString to return a generator that gives all
    header and data elements.

    Also checks the two failure modes exercised below: a plain ``str`` among
    the lines raises ``TypeError`` and an empty list raises
    ``InconsistentTableError``.
    """
    # NOTE(review): the markup literals were emptied in the damaged source;
    # they are rebuilt from ``expected_data`` (a header row and a data row).
    splitter = html.HTMLSplitter()

    lines = [
        html.SoupString(
            BeautifulSoup(
                "<table><tr><th>Col 1</th><th>Col 2</th></tr></table>", "html.parser"
            ).tr
        ),
        html.SoupString(
            BeautifulSoup(
                "<table><tr><td>Data 1</td><td>Data 2</td></tr></table>", "html.parser"
            ).tr
        ),
    ]
    expected_data = [["Col 1", "Col 2"], ["Data 1", "Data 2"]]
    assert list(splitter(lines)) == expected_data

    # Make sure the presence of a non-SoupString triggers a TypeError
    lines.append("<tr><td>Data 3</td><td>Data 4</td></tr>")
    with pytest.raises(TypeError):
        list(splitter(lines))

    # Make sure that passing an empty list triggers an error
    with pytest.raises(core.InconsistentTableError):
        list(splitter([]))
@pytest.mark.parametrize(
    "get_table",
    [
        lambda path: os.fspath(path),
        lambda path: Path(path),
        lambda path: Path(path).read_text(),
    ],
)
@pytest.mark.skipif(not HAS_BS4, reason="requires BeautifulSoup4")
def test_htmlheader_start(get_table):
    """
    Test to ensure that the start_line method of HTMLHeader
    returns the first line of header data. Uses data/html.html
    for sample input.

    Parameters
    ----------
    get_table : callable
        Maps the fixture path to the object handed to
        ``HTMLInputter.get_lines``: the path as ``str``, as ``Path``, or the
        file's text content.
    """
    # NOTE(review): the expected header strings and the small markup snippets
    # had their tags stripped in the damaged source; they are restored as
    # <tr>/<th>/<td> markup -- verify against the fixture data/html.html.
    table_file = "data/html.html"
    table = get_table(table_file)

    inputter = html.HTMLInputter()
    inputter.html = {}
    header = html.HTMLHeader()

    lines = inputter.get_lines(table)
    assert (
        str(lines[header.start_line(lines)])
        == "<tr><th>Column 1</th><th>Column 2</th><th>Column 3</th></tr>"
    )

    inputter.html["table_id"] = "second"
    lines = inputter.get_lines(table)
    assert (
        str(lines[header.start_line(lines)])
        == "<tr><th>Column A</th><th>Column B</th><th>Column C</th></tr>"
    )

    inputter.html["table_id"] = 3
    lines = inputter.get_lines(table)
    assert (
        str(lines[header.start_line(lines)])
        == "<tr><th>C1</th><th>C2</th><th>C3</th></tr>"
    )

    # start_line should return None if no valid header is found
    lines = [
        html.SoupString(
            BeautifulSoup("<table><tr><td>Data</td></tr></table>", "html.parser").tr
        ),
        html.SoupString(BeautifulSoup("<p>Text</p>", "html.parser").p),
    ]
    assert header.start_line(lines) is None

    # Should raise an error if a non-SoupString is present
    lines.append("<tr><th>Header</th></tr>")
    with pytest.raises(TypeError):
        header.start_line(lines)
@pytest.mark.skipif(not HAS_BS4, reason="requires BeautifulSoup4")
def test_htmldata():
    """
    Test to ensure that the start_line and end_line methods
    of HTMLData return the first line of table data and the index
    just past the last one. Uses data/html.html for sample input.
    """
    # NOTE(review): the expected row strings and the small markup snippets
    # had their tags stripped in the damaged source; they are restored as
    # <tr>/<td> markup -- verify against the fixture data/html.html.
    f = "data/html.html"
    with open(f) as fd:
        table = fd.read()

    inputter = html.HTMLInputter()
    inputter.html = {}
    data = html.HTMLData()

    lines = inputter.get_lines(table)
    assert (
        str(lines[data.start_line(lines)])
        == "<tr><td>1</td><td>a</td><td>1.05</td></tr>"
    )
    # end_line returns the index of the last data element + 1
    assert (
        str(lines[data.end_line(lines) - 1])
        == "<tr><td>3</td><td>c</td><td>-1.25</td></tr>"
    )

    inputter.html["table_id"] = "second"
    lines = inputter.get_lines(table)
    assert (
        str(lines[data.start_line(lines)])
        == "<tr><td>4</td><td>d</td><td>10.5</td></tr>"
    )
    assert (
        str(lines[data.end_line(lines) - 1])
        == "<tr><td>6</td><td>f</td><td>-12.5</td></tr>"
    )

    inputter.html["table_id"] = 3
    lines = inputter.get_lines(table)
    assert (
        str(lines[data.start_line(lines)])
        == "<tr><td>7</td><td>g</td><td>105.0</td></tr>"
    )
    assert (
        str(lines[data.end_line(lines) - 1])
        == "<tr><td>9</td><td>i</td><td>-125.0</td></tr>"
    )

    # start_line should raise an error if no table data exists
    lines = [
        html.SoupString(BeautifulSoup("<div></div>", "html.parser").div),
        html.SoupString(BeautifulSoup("<p>Text</p>", "html.parser").p),
    ]
    with pytest.raises(core.InconsistentTableError):
        data.start_line(lines)

    # end_line should return None if no table data exists
    assert data.end_line(lines) is None

    # Should raise an error if a non-SoupString is present
    lines.append("<tr><td>Data</td></tr>")
    with pytest.raises(TypeError):
        data.start_line(lines)
    with pytest.raises(TypeError):
        data.end_line(lines)
def test_multicolumn_write():
    """
    Check that the HTML writer emits multidimensional columns (columns
    whose elements are themselves iterable) using the ``colspan``
    attribute.
    """
    # NOTE(review): the HTML markup inside ``expected`` appears to have been
    # stripped from this copy of the file (only "| cell |" residue is left),
    # so the comparison cannot pass as written. The literal is kept verbatim;
    # restore it from the writer's real output.
    columns = [
        [1, 2, 3],
        [(1.0, 1.0), (2.0, 2.0), (3.0, 3.0)],
        [("a", "a", "a"), ("b", "b", "b"), ("c", "c", "c")],
    ]
    table = Table(columns, names=("C1", "C2", "C3"))
    expected = """\
| C1 |
C2 |
C3 |
| 1 |
1.0 |
1.0 |
a |
a |
a |
| 2 |
2.0 |
2.0 |
b |
b |
b |
| 3 |
3.0 |
3.0 |
c |
c |
c |
"""
    written = html.HTML().write(table)
    out = written[0].strip()
    assert out == expected.strip()
@pytest.mark.skipif(not HAS_BLEACH, reason="requires bleach")
def test_multicolumn_write_escape():
    """
    Check that the HTML writer emits multidimensional columns (columns
    whose elements are themselves iterable) using the ``colspan``
    attribute when column ``C3`` is declared in ``raw_html_cols``.
    """
    # NOTE(review): the HTML markup inside ``expected`` and inside the empty
    # strings of the third column appears to have been stripped from this copy
    # of the file. The literals are kept verbatim; restore them from the
    # upstream test / the writer's real output.
    columns = [
        [1, 2, 3],
        [(1.0, 1.0), (2.0, 2.0), (3.0, 3.0)],
        [("", "", "a"), ("", "b", "b"), ("c", "c", "c")],
    ]
    table = Table(columns, names=("C1", "C2", "C3"))
    expected = """\
| C1 |
C2 |
C3 |
| 1 |
1.0 |
1.0 |
 |
 |
a |
| 2 |
2.0 |
2.0 |
 |
b |
b |
| 3 |
3.0 |
3.0 |
c |
c |
c |
"""
    writer = html.HTML(htmldict={"raw_html_cols": "C3"})
    out = writer.write(table)[0].strip()
    assert out == expected.strip()
def test_write_no_multicols():
    """
    Check that the HTML writer does not expand multidimensional columns
    when it is constructed with ``{"multicol": False}``.
    """
    # NOTE(review): the HTML markup inside ``expected`` appears to have been
    # stripped from this copy of the file (only "| cell |" residue is left).
    # The literal is kept verbatim; restore it from the writer's real output.
    columns = [
        [1, 2, 3],
        [(1.0, 1.0), (2.0, 2.0), (3.0, 3.0)],
        [("a", "a", "a"), ("b", "b", "b"), ("c", "c", "c")],
    ]
    table = Table(columns, names=("C1", "C2", "C3"))
    expected = """\
| C1 |
C2 |
C3 |
| 1 |
1.0 .. 1.0 |
a .. a |
| 2 |
2.0 .. 2.0 |
b .. b |
| 3 |
3.0 .. 3.0 |
c .. c |
"""
    writer = html.HTML({"multicol": False})
    out = writer.write(table)[0].strip()
    assert out == expected.strip()
@pytest.mark.skipif(not HAS_BS4, reason="requires BeautifulSoup4")
def test_multicolumn_read():
"""
Test to make sure that the HTML reader inputs multidimensional
columns (those with iterable elements) using the colspan
attribute of | .
Ensure that any string element within a multidimensional column
casts all elements to string prior to type conversion operations.
"""
table = Table.read("data/html2.html", format="ascii.html")
str_type = np.dtype((str, 21))
expected = Table(
np.array(
[(["1", "2.5000000000000000001"], 3), (["1a", "1"], 3.5)],
dtype=[("A", str_type, (2,)), ("B", "x"], ["y"]], names=["a", "b"])
# One column contains raw HTML (string input)
out = StringIO()
t.write(out, format="ascii.html", htmldict={"raw_html_cols": "a"})
expected = """\
| x |
<em>y</em> |
"""
assert expected in out.getvalue()
# One column contains raw HTML (list input)
out = StringIO()
t.write(out, format="ascii.html", htmldict={"raw_html_cols": ["a"]})
assert expected in out.getvalue()
# Two columns contains raw HTML (list input)
out = StringIO()
t.write(out, format="ascii.html", htmldict={"raw_html_cols": ["a", "b"]})
expected = """\
| x |
y |
"""
assert expected in out.getvalue()
@pytest.mark.skipif(not HAS_BLEACH, reason="requires bleach")
def test_raw_html_write_clean():
"""
Test that columns can contain raw HTML which is not escaped.
"""
import bleach
t = Table(
[[""], ["y "], ["y"]], names=["a", "b", "c"]
)
# Confirm that |