From a81b183b97fb53b2619e256e3ad257982885ddbd Mon Sep 17 00:00:00 2001 From: caoqianming Date: Tue, 31 Mar 2026 14:15:53 +0800 Subject: [PATCH] fix(screening): avoid false positives from full phrases --- mycode/main.py | 56 ++++++++++++++++++++++++++++++++++------- tests/test_main.py | 62 ++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 109 insertions(+), 9 deletions(-) create mode 100644 tests/test_main.py diff --git a/mycode/main.py b/mycode/main.py index 6649758..5f076ad 100644 --- a/mycode/main.py +++ b/mycode/main.py @@ -7,6 +7,7 @@ from openpyxl import load_workbook from urllib.parse import urlparse from datetime import datetime import numpy as np +from collections import defaultdict wechat_dir = os.path.join(BASE_DIR, 'article') web_dir = os.path.join(BASE_DIR, 'web_dir') @@ -14,6 +15,47 @@ output_dir = os.path.join(BASE_DIR, 'summary') df_s = pd.read_excel(os.path.join(BASE_DIR, 'biao.xlsx'), sheet_name='筛查内容') +def split_candidate_phrases(text): + if pd.isna(text): + return [] + return [ + phrase.strip() + for phrase in re.split(r'[,,;;、\n]+', str(text)) + if phrase and phrase.strip() + ] + + +def build_phrase_exemption_rules(rules_df): + exemption_rules = defaultdict(set) + for _, row in rules_df.iterrows(): + error_phrase = str(row['错误表述']).strip() + if not error_phrase: + continue + for suggestion_phrase in split_candidate_phrases(row['建议修改词语']): + if suggestion_phrase != error_phrase and error_phrase in suggestion_phrase: + exemption_rules[error_phrase].add(suggestion_phrase) + return { + error_phrase: sorted(suggestion_phrases, key=len, reverse=True) + for error_phrase, suggestion_phrases in exemption_rules.items() + } + + +PHRASE_EXEMPTION_RULES = build_phrase_exemption_rules(df_s) + + +def should_skip_error_phrase(error_phrase, content, exemption_rules=None): + if error_phrase == '20大': + return True + if not isinstance(content, str) or not content: + return False + if exemption_rules is None: + exemption_rules = PHRASE_EXEMPTION_RULES + for full_phrase in exemption_rules.get(error_phrase, []): + if full_phrase in content: + return True + return False + + def fix_url_scheme(url, default_scheme='http'): # 检查URL是否包含方案 if not url.startswith('http://') and not url.startswith('https://'): @@ -502,14 +544,12 @@ def ana_wechat(): index = 1 for ind, row in df_s.iterrows(): - mask = df['content'].str.contains(row['错误表述']) + mask = df['content'].str.contains(row['错误表述'], regex=False) result = df[mask] if not result.empty: for ind2, row2 in result.iterrows(): - if row['错误表述'] == '“两学一做”学习' and '“两学一做”学习教育' in row2['content']: - continue - if row['错误表述'] == '20大': + if should_skip_error_phrase(row['错误表述'], row2['content']): continue output_row = [ index, @@ -554,13 +594,11 @@ def ana_web(): if os.path.exists(full_path) and os.path.getsize(full_path) > 0: df = pd.read_excel(os.path.join(full_path), engine='openpyxl') for ind, row in df_s.iterrows(): - mask = df['text'].str.contains(row['错误表述'], na=False) + mask = df['text'].str.contains(row['错误表述'], na=False, regex=False) result = df[mask] if not result.empty: for ind2, row2 in result.iterrows(): - if row['错误表述'] == '“两学一做”学习' and '“两学一做”学习教育' in row2['text']: - continue - if row['错误表述'] == '20大': + if should_skip_error_phrase(row['错误表述'], row2['text']): continue output_row = [ index, @@ -616,4 +654,4 @@ if __name__ == "__main__": print(df) cur.close() except Exception as e: - pass \ No newline at end of file + pass diff --git a/tests/test_main.py b/tests/test_main.py new file mode 100644 index 0000000..35f177e --- /dev/null +++ b/tests/test_main.py @@ -0,0 +1,62 @@ +import unittest + +import pandas as pd + +from mycode import main + + +class PhraseExemptionRulesTest(unittest.TestCase): + def test_builds_exemption_rules_from_suggestion_phrases(self): + rules_df = pd.DataFrame([ + { + '错误表述': '深入贯彻中央八项规定精神', + '建议修改词语': '深入贯彻中央八项规定精神学习教育', + '错误分类': '固定表述错误' + }, + { + '错误表述': '“两学一做”学习', + '建议修改词语': '“两学一做”学习教育', + '错误分类': '固定表述错误' + }, + ]) + + exemption_rules = main.build_phrase_exemption_rules(rules_df) + + self.assertEqual( + exemption_rules['深入贯彻中央八项规定精神'], + ['深入贯彻中央八项规定精神学习教育'] + ) + self.assertEqual( + exemption_rules['“两学一做”学习'], + ['“两学一做”学习教育'] + ) + + def test_skips_short_error_phrase_when_full_correct_phrase_exists(self): + exemption_rules = { + '深入贯彻中央八项规定精神': ['深入贯彻中央八项规定精神学习教育'] + } + + should_skip = main.should_skip_error_phrase( + '深入贯彻中央八项规定精神', + '现开展深入贯彻中央八项规定精神学习教育相关工作。', + exemption_rules + ) + + self.assertTrue(should_skip) + + def test_does_not_skip_when_only_short_error_phrase_exists(self): + exemption_rules = { + '深入贯彻中央八项规定精神': ['深入贯彻中央八项规定精神学习教育'] + } + + should_skip = main.should_skip_error_phrase( + '深入贯彻中央八项规定精神', + '文章仅写到深入贯彻中央八项规定精神,没有写完整。', + exemption_rules + ) + + self.assertFalse(should_skip) + + +if __name__ == '__main__': + unittest.main()