flairNLP · MaxDall · Mar 8, 2024 · Mar 7, 2024 · Mar 8, 2024 · addie9800
diff --git a/src/fundus/publishers/us/the_nation_parser.py b/src/fundus/publishers/us/the_nation_parser.py
@@ -1,6 +1,8 @@
+import re
 from datetime import date, datetime
 from typing import List, Optional
 
+import lxml.html
 from lxml.cssselect import CSSSelector
 from lxml.etree import XPath
 
@@ -23,14 +25,32 @@ class V1(BaseParser):
         # Currently(lxml 4.9.3), lxml does not accept p tags within any heading (h*) tag.
         # The "correct" selector would be ".article-header-content > h2 > p"
         _summary_selector: XPath = CSSSelector(".article-header-content > h2")
-        _paragraph_selector = CSSSelector(".article-body-inner > p")
+        _paragraph_selector: XPath = CSSSelector(".article-body-inner > p")
         _aside_selector = CSSSelector("aside")
 
+        _html_fix_pattern = re.compile(r'name="sft_double_opt_sail"\s*value="yes"\s/>\s*</form>')
+
+        # Known issue: this publisher serves broken HTML. The div.cta subtree is malformed -- a closing
+        # </div> tag is missing before the </form> tag, which prevents lxml from properly parsing the
+        # text. It is unclear whether V1 pages are affected as well, but better safe than sorry, so the
+        # fix lives in the V1 base class.
+        @function(priority=1)
+        def _fix_malformed_html(self) -> None:
+            """Re-parse the page with the closing </div> that is missing before </form> restored."""
+            # Re-parse only when a div whose id contains 'cta-block' is present.
+            if self.precomputed.doc.xpath("//div[contains(@id, 'cta-block')]"):
+                # Only the parsed document is replaced; precomputed.html itself is left untouched.
+                fixed_html = self._html_fix_pattern.sub(
+                    'name="sft_double_opt_sail"value="yes"/></div></form>',
+                    self.precomputed.html,
+                )
+                self.precomputed.doc = lxml.html.document_fromstring(fixed_html)
+
         # We remove aside tags here because the provided HTML does not enclose <p> tags
         # within .article-header-content. As a result, <aside> tags following <p> tags get attached
         # to the paragraph. This is valid HTML5 behaviour.
         # see https://stackoverflow.com/questions/8460993/p-end-tag-p-is-not-needed-in-html
-        @function(priority=1)
+        @function(priority=2)
         def _remove_aside(self) -> None:
             for aside in self._aside_selector(self.precomputed.doc):
                 if (parent := aside.getparent()) is not None:
@@ -63,8 +83,13 @@ def topics(self) -> List[str]:
     class V2(V1):
         VALID_UNTIL = date.today()
 
-        _summary_selector: XPath = XPath("//article//div[contains(@class, 'article-title')]//p")
-        _paragraph_selector: CSSSelector = CSSSelector("article > p")
+        # TheNation's markup is inconsistent: the layout changed on 2023-07-22, but older articles still
+        # use the old layout for the main content, so the V1 XPaths are combined with the V2 ones here.
+
+        _summary_selector = XPath(
+            "//div[@class='article-header-content'] /h2 | //div[contains(@class, 'article-title')] /p"
+        )
+        _paragraph_selector = XPath("(//article | //div[@class='article-body-inner']) / p")
 
         # remove aside function from V1
         def _remove_aside(self):