diff --git a/robots/pat.py b/robots/pat.py index 5e21130..263fcbd 100644 --- a/robots/pat.py +++ b/robots/pat.py @@ -55,9 +55,6 @@ class PATRobot(Robot): data["id"] = problem_id return data - def _clean_html(self, text): - return self._decode_html(re.compile("

|

|||\r|\n||").sub("", text)) - def _regex_page(self, url, regex): r = self.get(url) self.check_status_code(r) diff --git a/robots/robot.py b/robots/robot.py index f16c1ea..f01895d 100644 --- a/robots/robot.py +++ b/robots/robot.py @@ -1,4 +1,5 @@ # coding=utf-8 +import re import html import requests from .exceptions import RequestFailed, RegexError @@ -91,3 +92,10 @@ class Robot(object): if response.status_code != status_code: raise RequestFailed("Invalid status code [%d] when fetching url [%s], expected %d" % (response.status_code, response.url, status_code)) + + def _clean_html(self, text): + # 先去除部分html标记 + p1 = self._decode_html(re.compile(r"|

||||||").sub("", text)) + # <br>之类的转换为\n + p2 = re.compile(r"").sub(r"\n", p1) + return p2