增加富文本的 xss 过滤

2025-01-16 01:13:47 +00:00 · 2015-09-22 17:03:53 +08:00 · 2015-09-22 17:03:53 +08:00 · c26fd6734d
commit c26fd6734d
parent b224a823fe
8 changed files with 288 additions and 3 deletions
--- a/announcement/migrations/0003_auto_20150922_1703.py
+++ b/announcement/migrations/0003_auto_20150922_1703.py
@ -0,0 +1,20 @@
+# -*- coding: utf-8 -*-
+from __future__ import unicode_literals
+
+from django.db import models, migrations
+import utils.models
+
+
+class Migration(migrations.Migration):
+    # Changes the `content` field of the `announcement` model to
+    # utils.models.RichTextField.
+
+    # Must run after the previous announcement migration.
+    dependencies = [
+        ('announcement', '0002_auto_20150818_1445'),
+    ]
+
+    operations = [
+        migrations.AlterField(
+            model_name='announcement',
+            name='content',
+            field=utils.models.RichTextField(),
+        ),
+    ]
--- a/announcement/models.py
+++ b/announcement/models.py
@ -3,13 +3,14 @@ from django.db import models

 from account.models import User
 from group.models import Group
+from utils.models import RichTextField


 class Announcement(models.Model):
    # 标题
    title = models.CharField(max_length=50)
    # 公告的内容 HTML 格式
-    content = models.TextField()
+    content = RichTextField()
    # 创建时间
    create_time = models.DateTimeField(auto_now_add=True)
    # 这个公告是谁创建的
--- a/contest/migrations/0010_auto_20150922_1703.py
+++ b/contest/migrations/0010_auto_20150922_1703.py
@ -0,0 +1,25 @@
+# -*- coding: utf-8 -*-
+from __future__ import unicode_literals
+
+from django.db import models, migrations
+import utils.models
+
+
+class Migration(migrations.Migration):
+    # Changes the `description` field of both the `contest` and the
+    # `contestproblem` models to utils.models.RichTextField.
+
+    # Must run after the previous contest migration.
+    dependencies = [
+        ('contest', '0009_contestsubmission_first_achieved'),
+    ]
+
+    operations = [
+        migrations.AlterField(
+            model_name='contest',
+            name='description',
+            field=utils.models.RichTextField(),
+        ),
+        migrations.AlterField(
+            model_name='contestproblem',
+            name='description',
+            field=utils.models.RichTextField(),
+        ),
+    ]
--- a/contest/models.py
+++ b/contest/models.py
@ -5,6 +5,8 @@ from django.utils.timezone import now
 from account.models import User
 from problem.models import AbstractProblem
 from group.models import Group
+from utils.models import RichTextField
+

 GROUP_CONTEST = 0
 PUBLIC_CONTEST = 1
@ -17,7 +19,7 @@ CONTEST_UNDERWAY = 0

 class Contest(models.Model):
    title = models.CharField(max_length=40, unique=True)
-    description = models.TextField()
+    description = RichTextField()
    # 比赛模式：0 即为是acm模式，1 即为是按照总的 ac 题目数量排名模式
    mode = models.IntegerField()
    # 是否显示实时排名结果
--- a/problem/migrations/0008_auto_20150922_1702.py
+++ b/problem/migrations/0008_auto_20150922_1702.py
@ -0,0 +1,20 @@
+# -*- coding: utf-8 -*-
+from __future__ import unicode_literals
+
+from django.db import models, migrations
+import utils.models
+
+
+class Migration(migrations.Migration):
+    # Changes the `description` field of the `problem` model to
+    # utils.models.RichTextField.
+
+    # Must run after the previous problem migration.
+    dependencies = [
+        ('problem', '0007_remove_problem_last_update_time'),
+    ]
+
+    operations = [
+        migrations.AlterField(
+            model_name='problem',
+            name='description',
+            field=utils.models.RichTextField(),
+        ),
+    ]
--- a/problem/models.py
+++ b/problem/models.py
@ -2,6 +2,7 @@
 from django.db import models

 from account.models import User
+from utils.models import RichTextField


 class ProblemTag(models.Model):
@ -15,7 +16,7 @@ class AbstractProblem(models.Model):
    # 标题
    title = models.CharField(max_length=50)
    # 问题描述 HTML 格式
-    description = models.TextField()
+    description = RichTextField()
    # 输入描述
    input_description = models.CharField(max_length=10000)
    # 输出描述
--- a/utils/models.py
+++ b/utils/models.py
@ -0,0 +1,14 @@
+# coding=utf-8
+from django.db import models
+
+from utils.xss_filter import XssHtml
+
+
+class RichTextField(models.TextField):
+    """TextField whose value is run through XssHtml in get_prep_value()."""
+    __metaclass__ = models.SubfieldBase
+
+    def get_prep_value(self, value):
+        """Return ``value`` with disallowed HTML removed by XssHtml.
+
+        ``None`` is returned unchanged: HTMLParser.feed() concatenates its
+        argument onto a string buffer and would raise on a non-string.
+        """
+        if value is None:
+            return value
+        parser = XssHtml()
+        parser.feed(value)
+        parser.close()
+        return parser.getHtml()
--- a/utils/xss_filter.py
+++ b/utils/xss_filter.py
@ -0,0 +1,202 @@
+# -*- coding: utf-8 -*-
+"""
+Python 富文本XSS过滤类
+@package XssHtml
+@version 0.1
+@link http://phith0n.github.io/python-xss-filter
+@since 20150407
+@copyright (c) Phithon All Rights Reserved
+
+An HTML purifier based on Python's native HTMLParser module; it removes all JavaScript from HTML.
+You can use it in all python web framework
+Written by Phithon <root@leavesongs.com> in 2015 and placed in the public domain.
+phithon <root@leavesongs.com> 编写于20150407
+From: XDSEC <www.xdsec.org> & 离别歌 <www.leavesongs.com>
+GitHub Pages: https://github.com/phith0n/python-xss-filter
+Usage:
+    parser = XssHtml()
+    parser.feed('<html code>')
+    parser.close()
+    html = parser.getHtml()
+    print(html)
+
+Requirements
+Python 2.6+ or 3.2+
+Cannot defend against XSS in browsers older than IE7
+浏览器版本：IE7+ 或其他浏览器，无法防御IE6及以下版本浏览器中的XSS
+"""
+import re
+
+# Python 3 ships the parser as html.parser; fall back to the Python 2
+# module name only when that import is genuinely unavailable.
+try:
+    from html.parser import HTMLParser
+except ImportError:
+    from HTMLParser import HTMLParser
+
+
+class XssHtml(HTMLParser):
+    """Whitelist-based HTML filter built on HTMLParser.
+
+    Usage: feed() the untrusted markup, call close(), then read the
+    filtered markup from getHtml().  Tags outside ``allow_tags`` are
+    dropped while their text is kept (escaped); attributes outside
+    ``common_attrs`` and the tag's ``tags_own_attrs`` entry are dropped.
+    """
+    # Tags that may appear in the output.
+    allow_tags = ['a', 'img', 'br', 'strong', 'b', 'code', 'pre',
+                  'p', 'div', 'em', 'span', 'h1', 'h2', 'h3', 'h4',
+                  'h5', 'h6', 'blockquote', 'ul', 'ol', 'tr', 'th', 'td',
+                  'hr', 'li', 'u', 'embed', 's', 'table', 'thead', 'tbody',
+                  'caption', 'small', 'q', 'sup', 'sub']
+    # Attributes accepted on every allowed tag.
+    common_attrs = ["style", "class", "name"]
+    # Tags emitted in self-closing form; they are never pushed on the
+    # open-tag stack.
+    nonend_tags = ["img", "hr", "br", "embed"]
+    # Extra attributes accepted per tag, on top of common_attrs.
+    tags_own_attrs = {
+        "img": ["src", "width", "height", "alt", "align"],
+        "a": ["href", "target", "rel", "title"],
+        "embed": ["src", "width", "height", "type", "allowfullscreen", "loop", "play", "wmode", "menu"],
+        "table": ["border", "cellpadding", "cellspacing"],
+    }
+
+    # Patterns compiled once instead of on every call.
+    _url_prog = re.compile(r"^(http|https|ftp)://.+", re.I | re.S)
+    _style_tokens = re.compile(r"(\\|&#|/\*|\*/)")
+    # Case-insensitive so that "EXPRESSION(...)" is caught as well.
+    _style_expression = re.compile(r"e.*x.*p.*r.*e.*s.*s.*i.*o.*n", re.I | re.S)
+
+    def __init__(self, allows=None):
+        """Create a filter.
+
+        :param allows: optional list of tag names replacing the class-level
+            ``allow_tags``; an empty or missing value keeps the default.
+        """
+        HTMLParser.__init__(self)
+        self.allow_tags = allows if allows else self.allow_tags
+        # Output fragments in document order.
+        self.result = []
+        # Stack of currently open (non self-closing) allowed tags.
+        self.start = []
+        # Fragments used by the most recent getHtml() call.
+        self.data = []
+
+    def getHtml(self):
+        """
+        Get the safe html code
+
+        Each fragment has leading/trailing newlines stripped and empty
+        fragments are skipped.  The list is rebuilt on every call, so
+        calling this method repeatedly returns the same string.
+        """
+        stripped = (piece.strip('\n') for piece in self.result)
+        self.data = [piece for piece in stripped if piece]
+        return ''.join(self.data)
+
+    def handle_startendtag(self, tag, attrs):
+        """Treat ``<tag ... />`` exactly like an opening tag."""
+        self.handle_starttag(tag, attrs)
+
+    def handle_starttag(self, tag, attrs):
+        """Emit an allowed tag with its filtered attributes."""
+        if tag not in self.allow_tags:
+            return
+        end_diagonal = ' /' if tag in self.nonend_tags else ''
+        if not end_diagonal:
+            self.start.append(tag)
+        # HTMLParser reports valueless attributes (e.g. <embed loop>) as
+        # None; normalise to '' so the string helpers below do not crash.
+        attdict = {}
+        for (key, value) in attrs:
+            attdict[key] = '' if value is None else value
+
+        attdict = self._wash_attr(attdict, tag)
+        # Per-tag hook (node_a, node_embed, ...) or the default one.
+        handler = getattr(self, "node_%s" % tag, self.node_default)
+        attdict = handler(attdict)
+
+        rendered = ['%s="%s"' % (key, self._htmlspecialchars(value))
+                    for (key, value) in attdict.items()]
+        attr_text = (' ' + ' '.join(rendered)) if rendered else ''
+        self.result.append('<' + tag + attr_text + end_diagonal + '>')
+
+    def handle_endtag(self, tag):
+        """Emit a closing tag only if it matches the innermost open tag."""
+        if self.start and tag == self.start[-1]:
+            self.result.append('</' + tag + '>')
+            self.start.pop()
+
+    def handle_data(self, data):
+        """Emit text content with HTML special characters escaped."""
+        self.result.append(self._htmlspecialchars(data))
+
+    def handle_entityref(self, name):
+        """Pass through a named entity whose name is purely alphabetic."""
+        if name.isalpha():
+            self.result.append("&%s;" % name)
+
+    def handle_charref(self, name):
+        """Pass through a decimal character reference."""
+        if name.isdigit():
+            self.result.append("&#%s;" % name)
+
+    def node_default(self, attrs):
+        """Attribute hook for tags without a dedicated node_<tag> method."""
+        attrs = self._common_attr(attrs)
+        return attrs
+
+    def node_a(self, attrs):
+        """Attribute hook for <a>: fix href, default and restrict target."""
+        attrs = self._common_attr(attrs)
+        attrs = self._get_link(attrs, "href")
+        attrs = self._set_attr_default(attrs, "target", "_blank")
+        attrs = self._limit_attr(attrs, {
+            "target": ["_blank", "_self"]
+        })
+        return attrs
+
+    def node_embed(self, attrs):
+        """Attribute hook for <embed>: fix src, restrict values, and force
+        allowscriptaccess/allownetworking to their most restrictive values."""
+        attrs = self._common_attr(attrs)
+        attrs = self._get_link(attrs, "src")
+        attrs = self._limit_attr(attrs, {
+            "type": ["application/x-shockwave-flash"],
+            "wmode": ["transparent", "window", "opaque"],
+            "play": ["true", "false"],
+            "loop": ["true", "false"],
+            "menu": ["true", "false"],
+            "allowfullscreen": ["true", "false"]
+        })
+        attrs["allowscriptaccess"] = "never"
+        attrs["allownetworking"] = "none"
+        return attrs
+
+    def _true_url(self, url):
+        """Return ``url`` if it starts with http://, https:// or ftp://;
+        otherwise prefix it with "http://" (which defuses javascript: etc.)."""
+        if self._url_prog.match(url):
+            return url
+        return "http://%s" % url
+
+    def _true_style(self, style):
+        """Replace backslashes, "&#", CSS comment markers and anything
+        spelling "expression" (in any case) in a style value with "_"."""
+        if style:
+            style = self._style_tokens.sub("_", style)
+            style = self._style_expression.sub("_", style)
+        return style
+
+    def _get_style(self, attrs):
+        """Sanitize the "style" attribute in place, if present."""
+        if "style" in attrs:
+            attrs["style"] = self._true_style(attrs.get("style"))
+        return attrs
+
+    def _get_link(self, attrs, name):
+        """Sanitize the URL attribute ``name`` in place, if present."""
+        if name in attrs:
+            attrs[name] = self._true_url(attrs[name])
+        return attrs
+
+    def _wash_attr(self, attrs, tag):
+        """Remove from ``attrs`` every key not allowed for ``tag``."""
+        allowed = self.common_attrs + self.tags_own_attrs.get(tag, [])
+        if attrs:
+            # Iterate over a snapshot of the keys: deleting while iterating
+            # the live dict raises RuntimeError on Python 3.
+            for key in list(attrs):
+                if key not in allowed:
+                    del attrs[key]
+        return attrs
+
+    def _common_attr(self, attrs):
+        """Apply the sanitizing shared by all tags (currently: style)."""
+        attrs = self._get_style(attrs)
+        return attrs
+
+    def _set_attr_default(self, attrs, name, default=''):
+        """Set ``attrs[name]`` to ``default`` when it is missing."""
+        if name not in attrs:
+            attrs[name] = default
+        return attrs
+
+    def _limit_attr(self, attrs, limit=None):
+        """Drop attributes whose value is not in the allowed list.
+
+        :param limit: mapping of attribute name -> list of accepted values.
+        """
+        for (key, accepted) in (limit or {}).items():
+            if key in attrs and attrs[key] not in accepted:
+                del attrs[key]
+        return attrs
+
+    def _htmlspecialchars(self, html):
+        """Escape &, <, >, double and single quotes for HTML output."""
+        # "&" must be replaced first so the entities produced by the later
+        # replacements are not escaped a second time.
+        return html.replace("&", "&amp;") \
+            .replace("<", "&lt;") \
+            .replace(">", "&gt;") \
+            .replace('"', "&quot;") \
+            .replace("'", "&#039;")
+
+
+if __name__ == "__main__":
+    # Demo: push a hostile snippet through the filter and print what survives.
+    sample = """<p><img src=1 onerror=alert(/xss/)></p><div class="left">
+        <a href='javascript:prompt(1)'><br />hehe</a></div>
+        <p id="test" onmouseover="alert(1)">&gt;M<svg>
+        <a href="https://www.baidu.com" target="self">MM</a></p>
+        <embed src='javascript:alert(/hehe/)' allowscriptaccess=always />"""
+    demo = XssHtml()
+    demo.feed(sample)
+    demo.close()
+    print(demo.getHtml())