fix(screening): avoid false positives from full phrases

2026-03-31 14:15:53 +08:00 · 2026-03-31 14:15:53 +08:00 · a81b183b97
parent 89cf6ccc91
commit a81b183b97
2 changed files with 109 additions and 9 deletions
--- a/mycode/main.py
+++ b/mycode/main.py
@ -7,6 +7,7 @@ from openpyxl import load_workbook
 from urllib.parse import urlparse
 from datetime import datetime
 import numpy as np
 from collections import defaultdict
 wechat_dir = os.path.join(BASE_DIR, 'article')
 web_dir = os.path.join(BASE_DIR, 'web_dir')
@ -14,6 +15,47 @@ output_dir = os.path.join(BASE_DIR, 'summary')
 # Screening rules read from the '筛查内容' sheet of biao.xlsx under BASE_DIR.
 # Code in this file reads its '错误表述' (error phrase) and '建议修改词语'
 # (suggested replacement phrases) columns.
 df_s = pd.read_excel(os.path.join(BASE_DIR, 'biao.xlsx'), sheet_name='筛查内容')
 def split_candidate_phrases(text):
    """Split a delimiter-separated value into trimmed, non-empty phrases.

    Args:
        text: Cell value to split. Anything ``pd.isna`` reports as missing
            yields an empty list; other values are converted with ``str``.

    Returns:
        A list of phrases in their original order. The text is cut on runs
        of ASCII or full-width commas and semicolons, the ideographic comma
        and newlines; surrounding whitespace is stripped and empty pieces
        are dropped.
    """
    if pd.isna(text):
        return []
    phrases = []
    for piece in re.split(r'[,，;；、\n]+', str(text)):
        trimmed = piece.strip()
        # Consecutive or trailing delimiters leave empty pieces behind.
        if trimmed:
            phrases.append(trimmed)
    return phrases
 def build_phrase_exemption_rules(rules_df):
    """Map each error phrase to the longer suggested phrases that contain it.

    A suggestion that merely contains the error phrase (for example the full
    official wording of a truncated expression) would itself match a
    substring search for the error phrase. Collecting such suggestions lets
    callers recognise those matches as correct text.

    Args:
        rules_df: DataFrame with a '错误表述' column (error phrase) and a
            '建议修改词语' column (delimiter-separated suggested phrases).

    Returns:
        A dict mapping each error phrase to a list of the suggestions that
        contain it and differ from it, longest first. Suggestions of equal
        length are ordered alphabetically so the result is reproducible.
        Error phrases without such a suggestion are absent from the dict.
    """
    exemption_rules = defaultdict(set)
    for _, row in rules_df.iterrows():
        raw_error_phrase = row['错误表述']
        # A blank cell is read as NaN; str(NaN) would yield the bogus,
        # truthy phrase 'nan', so missing values are skipped up front.
        if pd.isna(raw_error_phrase):
            continue
        error_phrase = str(raw_error_phrase).strip()
        if not error_phrase:
            continue
        for suggestion_phrase in split_candidate_phrases(row['建议修改词语']):
            if suggestion_phrase != error_phrase and error_phrase in suggestion_phrase:
                exemption_rules[error_phrase].add(suggestion_phrase)
    return {
        # Secondary sort key keeps the order stable; set iteration order of
        # strings may differ between interpreter runs.
        error_phrase: sorted(
            suggestion_phrases,
            key=lambda phrase: (-len(phrase), phrase),
        )
        for error_phrase, suggestion_phrases in exemption_rules.items()
    }
 # Exemption rules built once at import time from the screening sheet (df_s);
 # used by should_skip_error_phrase when no explicit rules are passed in.
 PHRASE_EXEMPTION_RULES = build_phrase_exemption_rules(df_s)
 def should_skip_error_phrase(error_phrase, content, exemption_rules=None):
    """Decide whether a match of ``error_phrase`` in ``content`` is a false positive.

    The match is a false positive when every occurrence of the error phrase
    lies inside one of its exempt full phrases. If the error phrase also
    occurs on its own elsewhere in the content, the match is kept so the
    genuine error is still reported.

    Args:
        error_phrase: The error phrase that matched the content.
        content: The text that was searched. Non-string or empty values are
            never skipped.
        exemption_rules: Optional dict mapping error phrases to lists of
            full phrases; defaults to the module-level
            ``PHRASE_EXEMPTION_RULES``.

    Returns:
        True if the match should be skipped, False if it should be reported.
    """
    # The phrase '20大' is unconditionally excluded from reporting.
    if error_phrase == '20大':
        return True
    if not isinstance(content, str) or not content:
        return False
    if exemption_rules is None:
        exemption_rules = PHRASE_EXEMPTION_RULES
    present_full_phrases = [
        full_phrase
        for full_phrase in exemption_rules.get(error_phrase, [])
        if full_phrase and full_phrase in content
    ]
    if not present_full_phrases:
        return False
    remaining = content
    # Mask the longest phrases first so a shorter exempt phrase cannot split
    # a longer one. A sentinel (rather than deletion) keeps the neighbouring
    # text from being joined into a new, spurious match.
    for full_phrase in sorted(present_full_phrases, key=len, reverse=True):
        remaining = remaining.replace(full_phrase, '\x00')
    return error_phrase not in remaining
 def fix_url_scheme(url, default_scheme='http'):
    # 检查URL是否包含方案
    if not url.startswith('http://') and not url.startswith('https://'):
@ -502,14 +544,12 @@ def ana_wechat():
    index = 1
    for ind, row in df_s.iterrows():
-        mask = df['content'].str.contains(row['错误表述'])
+        mask = df['content'].str.contains(row['错误表述'], regex=False)
        result = df[mask]
        if not result.empty:
            for ind2, row2 in result.iterrows():
-                if row['错误表述'] == '“两学一做”学习' and '“两学一做”学习教育' in row2['content']:
+                if should_skip_error_phrase(row['错误表述'], row2['content']):
                    continue
                if row['错误表述'] == '20大':
                    continue
                output_row = [
                    index,
@ -554,13 +594,11 @@ def ana_web():
        if os.path.exists(full_path) and os.path.getsize(full_path) > 0:
            df = pd.read_excel(os.path.join(full_path), engine='openpyxl')
            for ind, row in df_s.iterrows():
-                mask = df['text'].str.contains(row['错误表述'], na=False)
+                mask = df['text'].str.contains(row['错误表述'], na=False, regex=False)
                result = df[mask]
                if not result.empty:
                    for ind2, row2 in result.iterrows():
-                        if row['错误表述'] == '“两学一做”学习' and '“两学一做”学习教育' in row2['text']:
+                        if should_skip_error_phrase(row['错误表述'], row2['text']):
                            continue
                        if row['错误表述'] == '20大':
                            continue
                        output_row = [
                            index,
@ -616,4 +654,4 @@ if __name__ == "__main__":
        print(df)
        cur.close()
    except Exception as e:
-        pass
+        pass
--- a/tests/test_main.py
+++ b/tests/test_main.py
@ -0,0 +1,62 @@
 import unittest
 import pandas as pd
 from mycode import main
 class PhraseExemptionRulesTest(unittest.TestCase):
    """Tests for the phrase-exemption helpers in ``mycode.main``."""

    # Truncated error phrase and the complete wording that contains it.
    SHORT_PHRASE = '深入贯彻中央八项规定精神'
    FULL_PHRASE = '深入贯彻中央八项规定精神学习教育'

    def _should_skip(self, content):
        """Run should_skip_error_phrase for SHORT_PHRASE with explicit rules."""
        rules = {self.SHORT_PHRASE: [self.FULL_PHRASE]}
        return main.should_skip_error_phrase(self.SHORT_PHRASE, content, rules)

    def test_builds_exemption_rules_from_suggestion_phrases(self):
        """Each error phrase maps to the suggestion that contains it."""
        expected = {
            self.SHORT_PHRASE: [self.FULL_PHRASE],
            '“两学一做”学习': ['“两学一做”学习教育'],
        }
        records = [
            {
                '错误表述': error_phrase,
                '建议修改词语': full_phrases[0],
                '错误分类': '固定表述错误',
            }
            for error_phrase, full_phrases in expected.items()
        ]

        built = main.build_phrase_exemption_rules(pd.DataFrame(records))

        for error_phrase, full_phrases in expected.items():
            self.assertEqual(built[error_phrase], full_phrases)

    def test_skips_short_error_phrase_when_full_correct_phrase_exists(self):
        """Content holding the full correct phrase is treated as exempt."""
        self.assertTrue(
            self._should_skip('现开展深入贯彻中央八项规定精神学习教育相关工作。')
        )

    def test_does_not_skip_when_only_short_error_phrase_exists(self):
        """Content holding only the truncated phrase is still reported."""
        self.assertFalse(
            self._should_skip('文章仅写到深入贯彻中央八项规定精神，没有写完整。')
        )
 # Run the test suite when this module is executed directly as a script.
 if __name__ == '__main__':
    unittest.main()