From 4dddbe59f901ef9774a5fe095ff141faae9cdfbb Mon Sep 17 00:00:00 2001 From: virusdefender Date: Thu, 3 Mar 2016 17:37:05 +0800 Subject: [PATCH] =?UTF-8?q?=E5=A2=9E=E5=8A=A0clean=5Fhtml=E7=9A=84?= =?UTF-8?q?=E7=88=B6=E7=B1=BB=E6=96=B9=E6=B3=95?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- robots/pat.py | 3 --- robots/robot.py | 8 ++++++++ 2 files changed, 8 insertions(+), 3 deletions(-) diff --git a/robots/pat.py b/robots/pat.py index 5e21130..263fcbd 100644 --- a/robots/pat.py +++ b/robots/pat.py @@ -55,9 +55,6 @@ class PATRobot(Robot): data["id"] = problem_id return data - def _clean_html(self, text): - return self._decode_html(re.compile("

|

|||\r|\n||").sub("", text)) - def _regex_page(self, url, regex): r = self.get(url) self.check_status_code(r) diff --git a/robots/robot.py b/robots/robot.py index f16c1ea..f01895d 100644 --- a/robots/robot.py +++ b/robots/robot.py @@ -1,4 +1,5 @@ # coding=utf-8 +import re import html import requests from .exceptions import RequestFailed, RegexError @@ -91,3 +92,10 @@ class Robot(object): if response.status_code != status_code: raise RequestFailed("Invalid status code [%d] when fetching url [%s], expected %d" % (response.status_code, response.url, status_code)) + + def _clean_html(self, text): + # 先去除部分html标记 + p1 = self._decode_html(re.compile(r"|

||||||").sub("", text)) + #
之类的转换为\n + p2 = re.compile(r"").sub(r"\n", p1) + return p2