fix(screening): avoid false positives from full phrases

2026-03-31 14:15:53 +08:00 · 2026-03-31 14:15:53 +08:00 · a81b183b97
parent 89cf6ccc91
commit a81b183b97
2 changed files with 109 additions and 9 deletions
--- a/mycode/main.py
+++ b/mycode/main.py
@@ -7,6 +7,7 @@ from openpyxl import load_workbook
 from urllib.parse import urlparse
 from datetime import datetime
 import numpy as np
+from collections import defaultdict

 wechat_dir = os.path.join(BASE_DIR, 'article')
 web_dir = os.path.join(BASE_DIR, 'web_dir')
@@ -14,6 +15,47 @@ output_dir = os.path.join(BASE_DIR, 'summary')
 df_s = pd.read_excel(os.path.join(BASE_DIR, 'biao.xlsx'), sheet_name='筛查内容')


+def split_candidate_phrases(text):
+    if pd.isna(text):
+        return []
+    return [
+        phrase.strip()
+        for phrase in re.split(r'[,，;；、\n]+', str(text))
+        if phrase and phrase.strip()
+    ]
+
+
+def build_phrase_exemption_rules(rules_df):
+    exemption_rules = defaultdict(set)
+    for _, row in rules_df.iterrows():
+        error_phrase = str(row['错误表述']).strip()
+        if not error_phrase:
+            continue
+        for suggestion_phrase in split_candidate_phrases(row['建议修改词语']):
+            if suggestion_phrase != error_phrase and error_phrase in suggestion_phrase:
+                exemption_rules[error_phrase].add(suggestion_phrase)
+    return {
+        error_phrase: sorted(suggestion_phrases, key=len, reverse=True)
+        for error_phrase, suggestion_phrases in exemption_rules.items()
+    }
+
+
+PHRASE_EXEMPTION_RULES = build_phrase_exemption_rules(df_s)
+
+
+def should_skip_error_phrase(error_phrase, content, exemption_rules=None):
+    if error_phrase == '20大':
+        return True
+    if not isinstance(content, str) or not content:
+        return False
+    if exemption_rules is None:
+        exemption_rules = PHRASE_EXEMPTION_RULES
+    for full_phrase in exemption_rules.get(error_phrase, []):
+        if full_phrase in content:
+            return True
+    return False
+
+
 def fix_url_scheme(url, default_scheme='http'):
    # 检查URL是否包含方案
    if not url.startswith('http://') and not url.startswith('https://'):
@@ -502,14 +544,12 @@ def ana_wechat():
    index = 1

    for ind, row in df_s.iterrows():
-        mask = df['content'].str.contains(row['错误表述'])
+        mask = df['content'].str.contains(row['错误表述'], regex=False)
        result = df[mask]

        if not result.empty:
            for ind2, row2 in result.iterrows():
-                if row['错误表述'] == '“两学一做”学习' and '“两学一做”学习教育' in row2['content']:
-                    continue
-                if row['错误表述'] == '20大':
+                if should_skip_error_phrase(row['错误表述'], row2['content']):
                    continue
                output_row = [
                    index,
@@ -554,13 +594,11 @@ def ana_web():
        if os.path.exists(full_path) and os.path.getsize(full_path) > 0:
            df = pd.read_excel(os.path.join(full_path), engine='openpyxl')
            for ind, row in df_s.iterrows():
-                mask = df['text'].str.contains(row['错误表述'], na=False)
+                mask = df['text'].str.contains(row['错误表述'], na=False, regex=False)
                result = df[mask]
                if not result.empty:
                    for ind2, row2 in result.iterrows():
-                        if row['错误表述'] == '“两学一做”学习' and '“两学一做”学习教育' in row2['text']:
-                            continue
-                        if row['错误表述'] == '20大':
+                        if should_skip_error_phrase(row['错误表述'], row2['text']):
                            continue
                        output_row = [
                            index,
@@ -616,4 +654,4 @@ if __name__ == "__main__":
        print(df)
        cur.close()
    except Exception as e:
-        pass
+        pass
--- a/tests/test_main.py
+++ b/tests/test_main.py
@@ -0,0 +1,62 @@
+import unittest
+
+import pandas as pd
+
+from mycode import main
+
+
+class PhraseExemptionRulesTest(unittest.TestCase):
+    def test_builds_exemption_rules_from_suggestion_phrases(self):
+        rules_df = pd.DataFrame([
+            {
+                '错误表述': '深入贯彻中央八项规定精神',
+                '建议修改词语': '深入贯彻中央八项规定精神学习教育',
+                '错误分类': '固定表述错误'
+            },
+            {
+                '错误表述': '“两学一做”学习',
+                '建议修改词语': '“两学一做”学习教育',
+                '错误分类': '固定表述错误'
+            },
+        ])
+
+        exemption_rules = main.build_phrase_exemption_rules(rules_df)
+
+        self.assertEqual(
+            exemption_rules['深入贯彻中央八项规定精神'],
+            ['深入贯彻中央八项规定精神学习教育']
+        )
+        self.assertEqual(
+            exemption_rules['“两学一做”学习'],
+            ['“两学一做”学习教育']
+        )
+
+    def test_skips_short_error_phrase_when_full_correct_phrase_exists(self):
+        exemption_rules = {
+            '深入贯彻中央八项规定精神': ['深入贯彻中央八项规定精神学习教育']
+        }
+
+        should_skip = main.should_skip_error_phrase(
+            '深入贯彻中央八项规定精神',
+            '现开展深入贯彻中央八项规定精神学习教育相关工作。',
+            exemption_rules
+        )
+
+        self.assertTrue(should_skip)
+
+    def test_does_not_skip_when_only_short_error_phrase_exists(self):
+        exemption_rules = {
+            '深入贯彻中央八项规定精神': ['深入贯彻中央八项规定精神学习教育']
+        }
+
+        should_skip = main.should_skip_error_phrase(
+            '深入贯彻中央八项规定精神',
+            '文章仅写到深入贯彻中央八项规定精神，没有写完整。',
+            exemption_rules
+        )
+
+        self.assertFalse(should_skip)
+
+
+if __name__ == '__main__':
+    unittest.main()