增加clean_html的父类方法

This commit is contained in:
virusdefender 2016-03-03 17:37:05 +08:00
parent 8e89837ba0
commit 4dddbe59f9
2 changed files with 8 additions and 3 deletions

View File

@ -55,9 +55,6 @@ class PATRobot(Robot):
data["id"] = problem_id
return data
def _clean_html(self, text):
return self._decode_html(re.compile("<p>|</p>|<b>|</b>|\r|\n|<span>|</span>").sub("", text))
def _regex_page(self, url, regex):
r = self.get(url)
self.check_status_code(r)

View File

@ -1,4 +1,5 @@
# coding=utf-8
import re
import html
import requests
from .exceptions import RequestFailed, RegexError
@ -91,3 +92,10 @@ class Robot(object):
if response.status_code != status_code:
raise RequestFailed("Invalid status code [%d] when fetching url [%s], expected %d" %
(response.status_code, response.url, status_code))
def _clean_html(self, text):
# 先去除部分html标记
p1 = self._decode_html(re.compile(r"<p.*?>|</p>|<b.*?>|</b>|<span.*?>|</span>|<i.*?>|</i>").sub("", text))
# <br>之类的转换为\n
p2 = re.compile(r"<br.*>").sub(r"\n", p1)
return p2