# zcspider/tests/test_main.py

import unittest

import pandas as pd

from mycode import main


class PhraseExemptionRulesTest(unittest.TestCase):
    """Exercise ``main.build_phrase_exemption_rules`` and
    ``main.should_skip_error_phrase``.

    The scenarios revolve around a "short" phrase that is listed as an error
    and the longer phrase suggested as its replacement, where the short
    phrase is a prefix of the suggested one.
    """

    # Short (flagged) wording and the full wording suggested in its place.
    _SHORT_PHRASE = '深入贯彻中央八项规定精神'
    _FULL_PHRASE = '深入贯彻中央八项规定精神学习教育'

    def _single_rule(self):
        """Return a fresh mapping ``{short phrase: [full phrase]}``.

        A new dict is built on every call so tests never share state.
        """
        return {self._SHORT_PHRASE: [self._FULL_PHRASE]}

    def test_builds_exemption_rules_from_suggestion_phrases(self):
        """Each error phrase should map to a list holding its suggestion."""
        columns = ('错误表述', '建议修改词语', '错误分类')
        records = (
            (self._SHORT_PHRASE, self._FULL_PHRASE, '固定表述错误'),
            ('“两学一做”学习', '“两学一做”学习教育', '固定表述错误'),
        )
        rules_df = pd.DataFrame(
            [dict(zip(columns, record)) for record in records]
        )

        built_rules = main.build_phrase_exemption_rules(rules_df)

        # Checked in the same order as the fixture rows; the first mismatch
        # aborts the test.
        expectations = (
            (self._SHORT_PHRASE, [self._FULL_PHRASE]),
            ('“两学一做”学习', ['“两学一做”学习教育']),
        )
        for wrong_phrase, allowed_forms in expectations:
            self.assertEqual(built_rules[wrong_phrase], allowed_forms)

    def test_skips_short_error_phrase_when_full_correct_phrase_exists(self):
        """The short phrase is skipped when the text holds the full phrase."""
        article_text = '现开展深入贯彻中央八项规定精神学习教育相关工作。'

        self.assertTrue(
            main.should_skip_error_phrase(
                self._SHORT_PHRASE,
                article_text,
                self._single_rule(),
            )
        )

    def test_does_not_skip_when_only_short_error_phrase_exists(self):
        """The short phrase is not skipped when the full phrase is absent."""
        article_text = '文章仅写到深入贯彻中央八项规定精神，没有写完整。'

        self.assertFalse(
            main.should_skip_error_phrase(
                self._SHORT_PHRASE,
                article_text,
                self._single_rule(),
            )
        )


class DeletedWechatContentFilterTest(unittest.TestCase):
    """Exercise ``main.filter_deleted_wechat_rows`` with an injected fetcher.

    Every test passes a stub ``fetch_html`` callable, so no network access
    happens. Each row is a list whose last element is the URL handed to the
    fetcher (see ``test_checks_same_url_only_once``); the remaining elements
    are opaque payload as far as these tests are concerned.

    Expected values are always built independently of the ``rows`` list that
    is passed to the function under test. Comparing the result against that
    same list would let an implementation that mutates its argument in place
    pass by accident.
    """

    def test_filters_rows_when_url_page_says_deleted(self):
        """Rows whose page reports deletion by the publisher are dropped."""
        rows = [
            [1, '公众号A', '标题A', '错误表述A', '建议A', '分类A', 'https://a.test/1'],
            [2, '公众号B', '标题B', '错误表述B', '建议B', '分类B', 'https://b.test/2'],
        ]

        def fetch_html(url):
            # Only the first URL serves the "deleted by publisher" page.
            if url == 'https://a.test/1':
                return '该内容已被发布者删除'
            return '正常文章内容'

        filtered_rows = main.filter_deleted_wechat_rows(rows, fetch_html=fetch_html)

        self.assertEqual(
            filtered_rows,
            [[2, '公众号B', '标题B', '错误表述B', '建议B', '分类B', 'https://b.test/2']]
        )

    def test_keeps_rows_when_url_page_is_not_deleted(self):
        """Rows whose page has ordinary content are returned unchanged."""
        rows = [
            [1, '公众号A', '标题A', '错误表述A', '建议A', '分类A', 'https://a.test/1'],
        ]
        # Snapshot taken before the call so in-place mutation of ``rows``
        # cannot mask a wrong result.
        expected_rows = [list(row) for row in rows]

        filtered_rows = main.filter_deleted_wechat_rows(
            rows,
            fetch_html=lambda url: '正文还在'
        )

        self.assertEqual(filtered_rows, expected_rows)

    def test_checks_same_url_only_once(self):
        """Two rows sharing one URL trigger exactly one fetch."""
        rows = [
            [1, '公众号A', '标题A', '错误表述A', '建议A', '分类A', 'https://same.test/1'],
            [2, '公众号A', '标题B', '错误表述B', '建议B', '分类B', 'https://same.test/1'],
        ]
        # Independent copy of the expected output (see class docstring).
        expected_rows = [list(row) for row in rows]
        calls = []

        def fetch_html(url):
            # Record every fetch so the call count can be asserted below.
            calls.append(url)
            return '正常文章内容'

        filtered_rows = main.filter_deleted_wechat_rows(rows, fetch_html=fetch_html)

        self.assertEqual(filtered_rows, expected_rows)
        self.assertEqual(calls, ['https://same.test/1'])


# Run the tests in this module when the file is executed directly as a script.
if __name__ == '__main__':
    unittest.main()