diff --git a/tests/test_parsers.py b/tests/test_parsers.py index 5d289d7..51bccc5 100644 --- a/tests/test_parsers.py +++ b/tests/test_parsers.py @@ -2,9 +2,11 @@ from urllib.parse import urlparse import copy import unittest +from unittest.mock import patch from lxml import etree +from xextract.extractors import HtmlXPathExtractor, XmlXPathExtractor from xextract.parsers import ( ParserError, ParsingError, BaseParser, BaseNamedParser, Prefix, Group, Element, String, Url, DateTime, Date) @@ -276,6 +278,135 @@ def test_text_extract(self): self.assertEqual(val, 'nice') +class TestElementAsExtractor(unittest.TestCase): + def test_element_as_parser(self): + """ + an element returned by Element(...).parse() can itself be passed as the input of another parser's parse_html() + """ + html = ''' +
Hello world!
+
+
Hello mars!
+ ''' + + # take the third matched div (index 2), not the first, so we can verify that the descendant of the given container is chosen + container = Element(css='div', count=3).parse(html)[2] + + val = Element(css='span', count=1).parse_html(container) + self.assertEqual(val.tag, 'span') + self.assertEqual(val.text, 'Hello mars!') + + def test_element_as_parser_html(self): + """ + passing Element as extractor for parse_html should use the correct parser + """ + + html = ''' + +
+ + a<a‘ +
+ + ''' + + original_tostring = etree.tostring + original_parser = HtmlXPathExtractor._parser + + # etree.HtmlParser is immutable so we can't patch it + # instead we patch HtmlXPathExtractor._parser which references it + with patch.object(HtmlXPathExtractor, attribute='_parser', autospec=True) as mock_parser: + mock_parser.side_effect = original_parser + with patch('lxml.etree.tostring', autospec=True) as mock_tostring: + mock_tostring.side_effect = original_tostring + + ancestor = Element(css='div', count=1).parse_html(html) + mock_parser.assert_called_once() + mock_parser.reset_mock() + mock_tostring.assert_not_called() + + script_val = String(css='script', count=1).parse_html(ancestor) + mock_parser.assert_not_called() + mock_tostring.assert_called_once() + self.assertEqual(mock_tostring.call_args.kwargs['method'], 'html') + mock_tostring.reset_mock() + + span_val = String(css='span', count=1).parse_html(ancestor) + mock_parser.assert_not_called() + mock_tostring.assert_called_once() + self.assertEqual(mock_tostring.call_args.kwargs['method'], 'html') + mock_tostring.reset_mock() + + # HTML has different entity processing from XML: + # + a<a‘ + + + + ''' + namespaces = {'a': 'http://test.com/', 'b': 'http://www.w3.org/2001/XMLSchema-instance'} + + original_tostring = etree.tostring + original_parser = XmlXPathExtractor._parser + + # etree.XMLParser is immutable so we can't patch it + # instead we patch XmlXPathExtractor._parser which references it + with patch.object(XmlXPathExtractor, attribute='_parser', autospec=True) as mock_parser: + + mock_parser.side_effect = original_parser + with patch('lxml.etree.tostring', autospec=True) as mock_tostring: + mock_tostring.side_effect = original_tostring + + ancestor = Element(xpath='//a:body', count=1, namespaces=namespaces).parse_xml(xml) + mock_parser.assert_called_once() + mock_parser.reset_mock() + mock_tostring.assert_not_called() + + script_val = String(xpath='//a:script', count=1, 
namespaces=namespaces).parse_xml(ancestor) + mock_parser.assert_not_called() + mock_tostring.assert_called_once() + self.assertEqual(mock_tostring.call_args.kwargs['method'], 'xml') + mock_tostring.reset_mock() + + span_val = String(xpath='//a:span', count=1, namespaces=namespaces).parse_xml(ancestor) + mock_parser.assert_not_called() + mock_tostring.assert_called_once() + self.assertEqual(mock_tostring.call_args.kwargs['method'], 'xml') + mock_tostring.reset_mock() + + + # XML has different entity processing from HTML: + #