From 4dddbe59f901ef9774a5fe095ff141faae9cdfbb Mon Sep 17 00:00:00 2001
From: virusdefender
Date: Thu, 3 Mar 2016 17:37:05 +0800
Subject: [PATCH] =?UTF-8?q?=E5=A2=9E=E5=8A=A0clean=5Fhtml=E7=9A=84?=
=?UTF-8?q?=E7=88=B6=E7=B1=BB=E6=96=B9=E6=B3=95?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
---
robots/pat.py | 3 ---
robots/robot.py | 8 ++++++++
2 files changed, 8 insertions(+), 3 deletions(-)
diff --git a/robots/pat.py b/robots/pat.py
index 5e21130..263fcbd 100644
--- a/robots/pat.py
+++ b/robots/pat.py
@@ -55,9 +55,6 @@ class PATRobot(Robot):
data["id"] = problem_id
return data
- def _clean_html(self, text):
- return self._decode_html(re.compile("|
|||\r|\n||").sub("", text))
-
def _regex_page(self, url, regex):
r = self.get(url)
self.check_status_code(r)
diff --git a/robots/robot.py b/robots/robot.py
index f16c1ea..f01895d 100644
--- a/robots/robot.py
+++ b/robots/robot.py
@@ -1,4 +1,5 @@
# coding=utf-8
+import re
import html
import requests
from .exceptions import RequestFailed, RegexError
@@ -91,3 +92,10 @@ class Robot(object):
if response.status_code != status_code:
raise RequestFailed("Invalid status code [%d] when fetching url [%s], expected %d" %
(response.status_code, response.url, status_code))
+
+ def _clean_html(self, text):
+ # First strip some HTML tags (NOTE(review): the tag names inside the regex literals look stripped in this copy of the patch -- confirm against the commit)
+ p1 = self._decode_html(re.compile(r"|
||||||").sub("", text))
+ # Convert line-break style tags (presumably <br> and the like -- confirm) to \n
+ p2 = re.compile(r"").sub(r"\n", p1)
+ return p2