Merge branch 'master' into server

* master:
  pat增加save方法和spj返回字段
  增加提示字段和转换PAT时间内存单位
  修正部分bug - 修正robots类中过滤掉<img>标签的错误 - 增加HdojRobots中"spj"以及"hint"的功能
  增加提示字段和转换PAT时间内存单位
  将内存由单位KB转换为单位M
  删除无用文件
  修正编码问题,格式问题,html解码问题
  完成Hduoj提交代码与获取结果的功能
  修正hdoj爬虫部分bug
  提交hdoj爬取题目的代码
2024-12-28 15:31:48 +00:00 · 2016-03-06 19:02:04 +08:00 · 2016-03-06 19:02:04 +08:00 · ce9e7457ef
commit ce9e7457ef
parent 9eb82a7c3f ad0a988efb
3 changed files with 169 additions and 4 deletions
--- a/robots/hduoj.py
+++ b/robots/hduoj.py
@ -0,0 +1,155 @@
+# coding=utf-8
+import re
+import requests
+from .robot import Robot
+from .exceptions import AuthFailed, RequestFailed, RegexError, SubmitProblemFailed
+from .utils import Language, Result
+
+
+class HduojRobot(Robot):
+    """Robot for the HDU Online Judge (http://acm.hdu.edu.cn).
+
+    Covers login, problem scraping, code submission and result lookup by
+    issuing HTTP requests through the base ``Robot`` helpers and parsing the
+    returned HTML with regular expressions.
+    """
+
+    def check_url(self, url):
+        """Return True if *url* is an HDOJ problem page with a 4-digit pid."""
+        # The dots are escaped so only the literal host and path match; an
+        # unescaped "." would accept any character in that position.
+        regex = r"^http://acm\.hdu\.edu\.cn/showproblem\.php\?pid=\d{4}$"
+        return re.match(regex, url) is not None
+
+    def login(self, username, password):
+        """Log in to HDOJ and store the response cookies on the instance.
+
+        Raises:
+            AuthFailed: if the login response is not a 302 redirect.
+        """
+        r = self.post("http://acm.hdu.edu.cn/userloginex.php?action=login",
+                      data={"username": username,
+                            "userpass": password,
+                            "login": "Sign In"},
+                      headers={"Content-Type": "application/x-www-form-urlencoded",
+                               "Referer": "http://acm.hdu.edu.cn/"})
+        # A successful login redirects (302) to the home page; a failed one
+        # returns the error page with status 200.
+        if r.status_code != 302:
+            raise AuthFailed("Failed to login Hduoj")
+
+        self.cookies = dict(r.cookies)
+
+    @property
+    def is_logged_in(self):
+        """Whether the stored cookies still belong to a logged-in session."""
+        r = self.get("http://acm.hdu.edu.cn/control_panel.php", cookies=self.cookies)
+        # Logged in: 200. Otherwise the site answers with a 302 redirect to
+        # the login page (redirects are not followed by ``get`` by default).
+        return r.status_code == 200
+
+    def get(self, url, headers=None, cookies=None, allow_redirects=False):
+        """GET *url* through the base class and mark the response as gb2312.
+
+        Redirects are not followed by default so callers can inspect the
+        302/200 status codes directly.
+        """
+        r = super().get(url, headers=headers, cookies=cookies, allow_redirects=allow_redirects)
+        # presumably a requests Response, where ``encoding`` controls how
+        # ``r.text`` is decoded -- confirm against Robot.get
+        r.encoding = "gb2312"
+        return r
+
+    def _regex_page(self, url, regex):
+        """Fetch *url* and extract one value per entry of *regex*.
+
+        Args:
+            url: page to download.
+            regex: mapping of field name -> regular expression.
+
+        Returns:
+            dict with one entry per field name:
+              * "spj": True if its pattern matched at all, else False.
+              * "hint": cleaned first match, or None when absent.
+              * "samples": ``[{"input": ..., "output": ...}]`` built from the
+                first two raw matches.
+              * anything else: the first match passed through ``_clean_html``.
+            "memory_limit" is converted from K to M (integer division by 1024)
+            and "time_limit" to int.
+
+        Raises:
+            RegexError: if a mandatory field has no match, or fewer than two
+                sample blocks are found.
+        """
+        r = self.get(url)
+        self.check_status_code(r)
+        data = {}
+        for k, v in regex.items():
+            items = re.findall(v, r.text)
+            if k == "spj":
+                # Only the presence of the marker matters.
+                data[k] = bool(items)
+            elif k == "hint":
+                # Hints are optional.
+                data[k] = self._clean_html(items[0]) if items else None
+            elif not items:
+                raise RegexError("No such data")
+            elif k == "samples":
+                # Input and output are two consecutive matches; a page with a
+                # single block previously crashed with IndexError.
+                if len(items) < 2:
+                    raise RegexError("No such data")
+                data[k] = [{"input": items[0], "output": items[1]}]
+            else:
+                data[k] = self._clean_html(items[0])
+        data["memory_limit"] = int(data["memory_limit"]) // 1024
+        data["time_limit"] = int(data["time_limit"])
+
+        return data
+
+    def get_problem(self, url):
+        """Scrape the problem at *url*.
+
+        Returns:
+            The dict produced by ``_regex_page`` plus "problem_id" (the
+            4-digit id taken from the url) and "submit_url".
+
+        Raises:
+            RequestFailed: if *url* is not accepted by ``check_url``.
+        """
+        if not self.check_url(url):
+            raise RequestFailed("Invaild Hduoj url")
+        # The limit patterns capture the second number of each "a/b" pair.
+        regex = {"title": r"<h1 style='color:#1A5CC8'>(.*)</h1>",
+                 "time_limit": r"Time Limit:\s*[\d]*/([\d]*)\s*MS",
+                 "memory_limit": r"Memory Limit:\s*[\d]*/([\d]*)\s*K",
+                 "description": r"Problem Description</div>\s*<div class=panel_content>([\s\S]*?)</div>",
+                 "input_description": r"Input</div>\s*<div class=panel_content>([\s\S]*?)</div>",
+                 "output_description": r"Output</div>\s*<div class=panel_content>([\s\S]*?)</div>",
+                 "hint": r"Hint(?:[\s\S]*?Hint[\s\S]*?</i>|</i>\s*</div>)([\s\S]*?)</div>",
+                 "spj": r"<font color=red>Special Judge</font>",
+                 "samples": r'Courier New,Courier,monospace;">([\s\S]*?)(?:<div|</div>)'}
+        problem_id = re.search(r"\d{4}", url).group()
+        data = self._regex_page(url, regex)
+        data["problem_id"] = problem_id
+        data["submit_url"] = "http://acm.hdu.edu.cn/submit.php?action=submit"
+        return data
+
+    def submit(self, submit_url, language, code, origin_id):
+        """Post *code* as a solution to problem *origin_id*.
+
+        Args:
+            submit_url: submission endpoint (also sent as the Referer).
+            language: ``Language.C`` maps to "1", ``Language.CPP`` to "0";
+                every other value is sent as "5".
+            code: source text of the solution.
+            origin_id: problem id on HDOJ.
+
+        Raises:
+            SubmitProblemFailed: if the response is not a 302 redirect.
+        """
+        # NOTE(review): strict encoding -- text containing characters outside
+        # GB2312 raises UnicodeEncodeError here; confirm whether that is wanted.
+        code = code.encode("gb2312")
+        if language == Language.C:
+            language = "1"
+        elif language == Language.CPP:
+            language = "0"
+        else:
+            language = "5"
+
+        r = self.post(submit_url, data={"check": "0", "problemid": origin_id,
+                                        "language": language,
+                                        "usercode": code},
+                      cookies=self.cookies,
+                      headers={"Content-Type": "application/x-www-form-urlencoded",
+                               "Referer": submit_url})
+
+        if r.status_code != 302:
+            raise SubmitProblemFailed("Faild to submit problem, url: %s, status code %d" % (submit_url, r.status_code))
+
+    def get_result(self, submission_id, username):
+        """Read the judge verdict from *username*'s status page.
+
+        The first row matched on the status page is used (presumably the most
+        recent submission -- confirm).
+
+        Returns:
+            dict with "result" (a ``Result`` value), "cpu_time" (int, MS, or
+            None), "memory" (int, K, or None), "info" (``{"result_text": ...}``)
+            and "error" (compiler output for compile errors, else None).
+
+        Raises:
+            RegexError: if no submission row can be parsed from the page.
+        """
+        status_url = r"http://acm.hdu.edu.cn/status.php?&user=" + username
+        # The header name is "Referer"; the former "Refer" key was a typo.
+        r = self.get(status_url,
+                     headers={"Referer": status_url})
+        self.check_status_code(r)
+
+        data = re.compile(r"(\d+)</td><td>\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}(?:[\s\S]*?)>"
+                          r"<font[\s\S]*?>(.*?)</font>[\s\S]*?(\d*)MS</td><td>(\d*)K").findall(r.text)
+        if not data:
+            # Previously an IndexError; report it like other parse failures.
+            raise RegexError("No submission record found on Hduoj status page")
+
+        # NOTE(review): the submission_id argument is discarded in favour of
+        # the run id of the first row -- confirm this is intended.
+        submission_id, code, time_text, memory_text = data[0]
+
+        # Verdict text shown by the judge -> Result; anything unknown is
+        # reported as a runtime error.
+        verdicts = {"Accepted": Result.accepted,
+                    "Queuing": Result.waiting,
+                    "Compiling": Result.waiting,
+                    "Running": Result.waiting,
+                    "Presentation Error": Result.format_error,
+                    "Wrong Answer": Result.wrong_answer,
+                    "Runtime Error": Result.runtime_error,
+                    "Time Limit Exceeded": Result.time_limit_exceeded,
+                    "Memory Limit Exceeded": Result.memory_limit_exceeded,
+                    "Output Limit Exceeded": Result.runtime_error,
+                    "Compilation Error": Result.compile_error,
+                    "System Error": Result.system_error}
+        result = verdicts.get(code, Result.runtime_error)
+
+        # The numeric groups may be empty strings.
+        cpu_time = int(time_text) if time_text else None
+        memory = int(memory_text) if memory_text else None
+
+        error = None
+
+        if result == Result.compile_error:
+            r = self.get(r"http://acm.hdu.edu.cn/viewerror.php?rid=" + submission_id,
+                         headers={"Referer": "http://acm.hdu.edu.cn/status.php?first=&pid=&lang=0&status=0&user=" + username})
+            self.check_status_code(r)
+            # Use the matched text itself; str() of the findall list would
+            # wrap the message in "['...']".
+            messages = re.findall(r"<pre>([\s\S]*)</pre>", r.text)
+            if messages:
+                error = self._clean_html(messages[0])
+
+        return {"result": result, "cpu_time": cpu_time, "memory": memory,
+                "info": {"result_text": self._clean_html(code)}, "error": error}
+
--- a/robots/pat.py
+++ b/robots/pat.py
@ -6,9 +6,12 @@ from .utils import Language, Result


 class PATRobot(Robot):
-    def __init__(self, cookies=None):
+    def __init__(self, cookies=None, token=""):
        super().__init__(cookies=cookies)
-        self.token = ""
+        self.token = token
+
+    def save(self):
+        """Return the robot's cookies and token as a dict.
+
+        These are the same two values that ``__init__`` accepts.
+        """
+        return {"cookies": self.cookies, "token": self.token}

    def check_url(self, url):
        regex = r"^https://www.patest.cn/contests/pat-(a|b|t)-practise/1\d{3}$"
@ -56,7 +59,13 @@ class PATRobot(Robot):
                 "submit_url": r'<form accept-charset="UTF-8" action="([\s\S]*?)" method="post">'}
        data = self._regex_page(url, regex)
        data["id"] = problem_id
+        data["time_limit"] = int(data["time_limit"])
+        data["memory_limit"] = int(data["memory_limit"]) // 1024
        data["submit_url"] = "https://www.patest.cn" + data["submit_url"]
+        # pat上都没有提示
+        data["hint"] = None
+        # pat没有spj
+        data["spj"] = False
        return data

    def _regex_page(self, url, regex):
--- a/robots/robot.py
+++ b/robots/robot.py
@ -44,7 +44,8 @@ class Robot(object):
                  "samples": [{"input": String, "output": String}],
                  "spj": True/False,
                  "time_limit": Int ms, 
-                  "memory_limit": Int M}
+                  "memory_limit": Int M,
+                  "hint": String/None}
        """
        raise NotImplementedError()

@ -129,7 +130,7 @@ class Robot(object):

    def _clean_html(self, text):
        # 先去除部分html标记
-        p1 = self._decode_html(re.compile(r"<p.*?>|</p>|<b.*?>|</b>|<span.*?>|</span>|<i.*?>|</i>").sub("", text))
+        p1 = self._decode_html(re.compile(r"<p.*?>|</p>|<b.*?>|</b>|<span.*?>|</span>").sub("", text))
        # <br>之类的转换为\n
        p2 = re.compile(r"<br.*>").sub(r"\n", p1)
        return p2