From a81b183b97fb53b2619e256e3ad257982885ddbd Mon Sep 17 00:00:00 2001
From: caoqianming <caoqianming@foxmail.com>
Date: Tue, 31 Mar 2026 14:15:53 +0800
Subject: [PATCH] fix(screening): avoid false positives from full phrases

---
 mycode/main.py     | 56 ++++++++++++++++++++++++++++++++++-------
 tests/test_main.py | 62 ++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 109 insertions(+), 9 deletions(-)
 create mode 100644 tests/test_main.py

diff --git a/mycode/main.py b/mycode/main.py
index 6649758..5f076ad 100644
--- a/mycode/main.py
+++ b/mycode/main.py
@@ -7,6 +7,7 @@ from openpyxl import load_workbook
 from urllib.parse import urlparse
 from datetime import datetime
 import numpy as np
+from collections import defaultdict
 
 wechat_dir = os.path.join(BASE_DIR, 'article')
 web_dir = os.path.join(BASE_DIR, 'web_dir')
@@ -14,6 +15,47 @@ output_dir = os.path.join(BASE_DIR, 'summary')
 df_s = pd.read_excel(os.path.join(BASE_DIR, 'biao.xlsx'), sheet_name='筛查内容')
 
 
+def split_candidate_phrases(text):
+    """Split a cell value into trimmed, non-empty phrases ([] for NaN/None).
+
+    Separators are ASCII/full-width commas and semicolons, '、' and newlines.
+    """
+    if pd.isna(text):
+        return []
+    return [p for p in map(str.strip, re.split(r'[,，;；、\n]+', str(text))) if p]
+
+
+def build_phrase_exemption_rules(rules_df):
+    """Map each error phrase to the longer suggested phrases that contain it,
+    as lists ordered longest first (ties by string order, so output is stable)."""
+    exemption_rules = defaultdict(set)
+    for _, row in rules_df.iterrows():
+        # A blank cell is NaN and str(NaN) is 'nan'; treat it as empty instead.
+        error_phrase = '' if pd.isna(row['错误表述']) else str(row['错误表述']).strip()
+        if not error_phrase:
+            continue
+        for suggestion_phrase in split_candidate_phrases(row['建议修改词语']):
+            if suggestion_phrase != error_phrase and error_phrase in suggestion_phrase:
+                exemption_rules[error_phrase].add(suggestion_phrase)
+    return {k: sorted(v, key=lambda p: (-len(p), p)) for k, v in exemption_rules.items()}
+
+
+PHRASE_EXEMPTION_RULES = build_phrase_exemption_rules(df_s)  # computed once at import from df_s
+
+
+def should_skip_error_phrase(error_phrase, content, exemption_rules=None):
+    """Return True if every hit of error_phrase in content sits inside an exempt full phrase."""
+    if error_phrase == '20大':  # this rule is never reported, whatever the content
+        return True
+    if not isinstance(content, str) or not content:
+        return False
+    rules = PHRASE_EXEMPTION_RULES if exemption_rules is None else exemption_rules
+    remainder = content
+    for full_phrase in rules.get(error_phrase, []):
+        remainder = remainder.replace(full_phrase, '\0')  # mask, not delete: neighbours must not fuse
+    return remainder != content and error_phrase not in remainder
+
+
 def fix_url_scheme(url, default_scheme='http'):
     # 检查URL是否包含方案
     if not url.startswith('http://') and not url.startswith('https://'):
@@ -502,14 +544,12 @@ def ana_wechat():
     index = 1
 
     for ind, row in df_s.iterrows():
-        mask = df['content'].str.contains(row['错误表述'])
+        mask = df['content'].str.contains(row['错误表述'], regex=False)
         result = df[mask]
 
         if not result.empty:
             for ind2, row2 in result.iterrows():
-                if row['错误表述'] == '“两学一做”学习' and '“两学一做”学习教育' in row2['content']:
-                    continue
-                if row['错误表述'] == '20大':
+                if should_skip_error_phrase(row['错误表述'], row2['content']):
                     continue
                 output_row = [
                     index,
@@ -554,13 +594,11 @@ def ana_web():
         if os.path.exists(full_path) and os.path.getsize(full_path) > 0:
             df = pd.read_excel(os.path.join(full_path), engine='openpyxl')
             for ind, row in df_s.iterrows():
-                mask = df['text'].str.contains(row['错误表述'], na=False)
+                mask = df['text'].str.contains(row['错误表述'], na=False, regex=False)
                 result = df[mask]
                 if not result.empty:
                     for ind2, row2 in result.iterrows():
-                        if row['错误表述'] == '“两学一做”学习' and '“两学一做”学习教育' in row2['text']:
-                            continue
-                        if row['错误表述'] == '20大':
+                        if should_skip_error_phrase(row['错误表述'], row2['text']):
                             continue
                         output_row = [
                             index,
@@ -616,4 +654,4 @@ if __name__ == "__main__":
         print(df)
         cur.close()
     except Exception as e:
-        pass
\ No newline at end of file
+        pass
diff --git a/tests/test_main.py b/tests/test_main.py
new file mode 100644
index 0000000..35f177e
--- /dev/null
+++ b/tests/test_main.py
@@ -0,0 +1,62 @@
+import unittest
+
+import pandas as pd
+
+from mycode import main
+
+
+class PhraseExemptionRulesTest(unittest.TestCase):
+    """Exercise build_phrase_exemption_rules and should_skip_error_phrase.
+
+    Inputs are built inline, so no assertion depends on the contents of
+    the module-level PHRASE_EXEMPTION_RULES table.
+    """
+
+    def test_builds_exemption_rules_from_suggestion_phrases(self):
+        # Each error phrase should map to the longer suggestion containing it.
+        expected = {
+            '深入贯彻中央八项规定精神': ['深入贯彻中央八项规定精神学习教育'],
+            '“两学一做”学习': ['“两学一做”学习教育'],
+        }
+        # One sheet row per expected entry; the suggestion is the full phrase.
+        sheet = pd.DataFrame([
+            {
+                '错误表述': wrong,
+                '建议修改词语': fixes[0],
+                '错误分类': '固定表述错误',
+            }
+            for wrong, fixes in expected.items()
+        ])
+
+        built = main.build_phrase_exemption_rules(sheet)
+
+        for wrong, fixes in expected.items():
+            self.assertEqual(built[wrong], fixes)
+
+    def test_skips_short_error_phrase_when_full_correct_phrase_exists(self):
+        # The text contains the complete phrase, so the short one is exempt.
+        short = '深入贯彻中央八项规定精神'
+        rules = {
+            short: ['深入贯彻中央八项规定精神学习教育'],
+        }
+        text = '现开展深入贯彻中央八项规定精神学习教育相关工作。'
+
+        verdict = main.should_skip_error_phrase(short, text, rules)
+
+        self.assertTrue(verdict)
+
+    def test_does_not_skip_when_only_short_error_phrase_exists(self):
+        # The text stops at the short phrase, so the hit must be kept.
+        short = '深入贯彻中央八项规定精神'
+        rules = {
+            short: ['深入贯彻中央八项规定精神学习教育'],
+        }
+        text = '文章仅写到深入贯彻中央八项规定精神，没有写完整。'
+
+        verdict = main.should_skip_error_phrase(short, text, rules)
+
+        self.assertFalse(verdict)
+
+
+if __name__ == '__main__':
+    unittest.main()  # run this module's tests when executed as a script