# Source listing metadata: 182 lines, 6.0 KiB, Python.
import unittest
|
|
|
|
import pandas as pd
|
|
|
|
from mycode import main
|
|
|
|
|
|
class PhraseExemptionRulesTest(unittest.TestCase):
    """Tests for building phrase exemption rules and applying them to text."""

    @staticmethod
    def _single_phrase_rules():
        # Fresh mapping per call so no state is shared between tests:
        # short (flagged) phrase -> list of full, correct phrases.
        return {
            '深入贯彻中央八项规定精神': ['深入贯彻中央八项规定精神学习教育']
        }

    def test_builds_exemption_rules_from_suggestion_phrases(self):
        # Two rule records; key order fixes the DataFrame column order.
        records = [
            {
                '错误表述': '深入贯彻中央八项规定精神',
                '建议修改词语': '深入贯彻中央八项规定精神学习教育',
                '错误分类': '固定表述错误'
            },
            {
                '错误表述': '“两学一做”学习',
                '建议修改词语': '“两学一做”学习教育',
                '错误分类': '固定表述错误'
            },
        ]

        built_rules = main.build_phrase_exemption_rules(pd.DataFrame(records))

        # Each error phrase must map to a list holding its suggested phrase.
        expected_pairs = (
            ('深入贯彻中央八项规定精神', ['深入贯彻中央八项规定精神学习教育']),
            ('“两学一做”学习', ['“两学一做”学习教育']),
        )
        for error_phrase, suggestions in expected_pairs:
            self.assertEqual(built_rules[error_phrase], suggestions)

    def test_skips_short_error_phrase_when_full_correct_phrase_exists(self):
        # The text contains the full correct phrase, so the short phrase
        # found inside it is expected to be skipped.
        text = '现开展深入贯彻中央八项规定精神学习教育相关工作。'

        outcome = main.should_skip_error_phrase(
            '深入贯彻中央八项规定精神',
            text,
            self._single_phrase_rules()
        )

        self.assertTrue(outcome)

    def test_does_not_skip_when_only_short_error_phrase_exists(self):
        # The text contains only the short phrase, so it must not be skipped.
        text = '文章仅写到深入贯彻中央八项规定精神,没有写完整。'

        outcome = main.should_skip_error_phrase(
            '深入贯彻中央八项规定精神',
            text,
            self._single_phrase_rules()
        )

        self.assertFalse(outcome)
|
|
|
|
|
|
class DeletedWechatContentFilterTest(unittest.TestCase):
    """Tests for main.filter_deleted_wechat_rows using stubbed page fetchers."""

    def test_filters_rows_when_url_page_says_deleted(self):
        deleted_url = 'https://a.test/1'
        rows = [
            [1, '公众号A', '标题A', '错误表述A', '建议A', '分类A', 'https://a.test/1'],
            [2, '公众号B', '标题B', '错误表述B', '建议B', '分类B', 'https://b.test/2'],
        ]

        def fetch_html(url):
            # Only the first URL serves the "deleted by publisher" notice.
            return '该内容已被发布者删除' if url == deleted_url else '正常文章内容'

        remaining = main.filter_deleted_wechat_rows(rows, fetch_html=fetch_html)

        expected = [
            [2, '公众号B', '标题B', '错误表述B', '建议B', '分类B', 'https://b.test/2']
        ]
        self.assertEqual(remaining, expected)

    def test_keeps_rows_when_url_page_is_not_deleted(self):
        rows = [
            [1, '公众号A', '标题A', '错误表述A', '建议A', '分类A', 'https://a.test/1'],
        ]

        def fetch_html(url):
            # Every page still has its body text.
            return '正文还在'

        remaining = main.filter_deleted_wechat_rows(rows, fetch_html=fetch_html)

        self.assertEqual(remaining, rows)

    def test_checks_same_url_only_once(self):
        rows = [
            [1, '公众号A', '标题A', '错误表述A', '建议A', '分类A', 'https://same.test/1'],
            [2, '公众号A', '标题B', '错误表述B', '建议B', '分类B', 'https://same.test/1'],
        ]
        requested_urls = []

        def fetch_html(url):
            # Record every fetch so duplicate requests would be visible.
            requested_urls.append(url)
            return '正常文章内容'

        remaining = main.filter_deleted_wechat_rows(rows, fetch_html=fetch_html)

        self.assertEqual(remaining, rows)
        # Both rows share one URL, so exactly one fetch is expected.
        self.assertEqual(requested_urls, ['https://same.test/1'])

    def test_keeps_rows_when_fetch_fails(self):
        rows = [
            [1, '公众号A', '标题A', '错误表述A', '建议A', '分类A', 'https://a.test/1'],
        ]

        def fetch_html(url):
            # Simulate a fetch that always times out.
            raise TimeoutError(url)

        remaining = main.filter_deleted_wechat_rows(rows, fetch_html=fetch_html)

        # A failed fetch must not cause the row to be dropped.
        self.assertEqual(remaining, rows)

    def test_reports_url_check_progress(self):
        rows = [
            [1, '公众号A', '标题A', '错误表述A', '建议A', '分类A', 'https://a.test/1'],
            [2, '公众号B', '标题B', '错误表述B', '建议B', '分类B', 'https://b.test/2'],
            [3, '公众号A', '标题C', '错误表述C', '建议C', '分类C', 'https://a.test/1'],
        ]
        reported = []

        def fetch_html(url):
            return '正常文章内容'

        def record_progress(completed, total):
            reported.append((completed, total))

        main.filter_deleted_wechat_rows(
            rows,
            fetch_html=fetch_html,
            progress_callback=record_progress
        )

        # Three rows but two distinct URLs: the final report is (2, 2)
        # and exactly two reports are made.
        self.assertEqual(reported[-1], (2, 2))
        self.assertEqual(len(reported), 2)
|
|
|
|
|
|
class WechatAnalysisProgressTest(unittest.TestCase):
    """Tests for progress reporting from main.ana_wechat."""

    def test_reports_rule_scan_progress(self):
        # Two rules; only the first one occurs in the single article below.
        rule_records = [
            {
                '错误表述': '错误A',
                '建议修改词语': '修改A',
                '错误分类': '分类A'
            },
            {
                '错误表述': '错误B',
                '建议修改词语': '修改B',
                '错误分类': '分类B'
            },
        ]
        article_records = [
            {
                'nickname': '公众号A',
                'title': '标题A',
                'content': '这里包含错误A',
                'content_url': 'https://a.test/1'
            }
        ]
        reported = []

        def record_progress(completed, total):
            reported.append((completed, total))

        def fetch_html(url):
            # The article page is reported as still available.
            return '正常文章内容'

        result_rows = main.ana_wechat(
            rules_df=pd.DataFrame(rule_records),
            articles_df=pd.DataFrame(article_records),
            progress_callback=record_progress,
            fetch_html=fetch_html
        )

        # One row is expected back, and the last progress report is (2, 2).
        self.assertEqual(len(result_rows), 1)
        self.assertEqual(reported[-1], (2, 2))
|
|
|
|
|
|
# Run the whole suite only when this file is executed as a script,
# not when it is imported by a test runner.
if __name__ == '__main__':
    unittest.main()
|