增加clean_html的父类方法

2024-12-29 07:51:45 +00:00 · 2016-03-03 17:37:05 +08:00 · 2016-03-03 17:37:05 +08:00 · 4dddbe59f9
commit 4dddbe59f9
parent 8e89837ba0
2 changed files with 8 additions and 3 deletions
--- a/robots/pat.py
+++ b/robots/pat.py
@ -55,9 +55,6 @@ class PATRobot(Robot):
        data["id"] = problem_id
        return data

-    def _clean_html(self, text):
-        return self._decode_html(re.compile("<p>|</p>|<b>|</b>|\r|\n|<span>|</span>").sub("", text))
-
    def _regex_page(self, url, regex):
        r = self.get(url)
        self.check_status_code(r)
--- a/robots/robot.py
+++ b/robots/robot.py
@ -1,4 +1,5 @@
 # coding=utf-8
+import re
 import html
 import requests
 from .exceptions import RequestFailed, RegexError
@ -91,3 +92,10 @@ class Robot(object):
        if response.status_code != status_code:
            raise RequestFailed("Invalid status code [%d] when fetching url [%s], expected %d" %
                                (response.status_code, response.url, status_code))
+
+    def _clean_html(self, text):
+        # 先去除部分html标记
+        p1 = self._decode_html(re.compile(r"<p.*?>|</p>|<b.*?>|</b>|<span.*?>|</span>|<i.*?>|</i>").sub("", text))
+        # <br>之类的转换为\n
+        p2 = re.compile(r"<br.*>").sub(r"\n", p1)
+        return p2