fix(screening): avoid false positives from full phrases

This commit is contained in:
caoqianming 2026-03-31 14:15:53 +08:00
parent 89cf6ccc91
commit a81b183b97
2 changed files with 109 additions and 9 deletions

View File

@ -7,6 +7,7 @@ from openpyxl import load_workbook
from urllib.parse import urlparse
from datetime import datetime
import numpy as np
from collections import defaultdict
# Sub-directories of BASE_DIR named 'article' and 'web_dir'.
# NOTE(review): presumably the WeChat-article and website inputs that
# ana_wechat() / ana_web() scan -- confirm against those functions.
wechat_dir = os.path.join(BASE_DIR, 'article')
web_dir = os.path.join(BASE_DIR, 'web_dir')
@ -14,6 +15,47 @@ output_dir = os.path.join(BASE_DIR, 'summary')
# Screening rule table, read at import time from the '筛查内容' sheet of
# biao.xlsx under BASE_DIR. Code in this module reads its '错误表述' and
# '建议修改词语' columns.
df_s = pd.read_excel(os.path.join(BASE_DIR, 'biao.xlsx'), sheet_name='筛查内容')
def split_candidate_phrases(text):
    """Break one rule-table cell into its individual, trimmed phrases.

    The cell is converted to ``str`` and split on runs of the delimiter
    characters in the pattern below (commas, semicolons, the enumeration
    comma ``、`` and newlines). Surrounding whitespace is stripped from each
    piece and pieces that end up empty are dropped.

    Args:
        text: Cell value; may be a string, a number, ``None`` or NaN.

    Returns:
        list[str]: The phrases in their original order, or ``[]`` when the
        cell is missing (``pd.isna``) or holds nothing but delimiters.
    """
    if pd.isna(text):
        return []

    phrases = []
    for piece in re.split(r'[,;;、\n]+', str(text)):
        trimmed = piece.strip()
        if trimmed:
            phrases.append(trimmed)
    return phrases
def build_phrase_exemption_rules(rules_df):
    """Map each error phrase to the longer suggested phrases that contain it.

    For every row, the '错误表述' cell is the error phrase and the
    '建议修改词语' cell is split into candidate phrases with
    ``split_candidate_phrases``. A candidate becomes an exemption for the
    error phrase when it differs from the error phrase and contains it as a
    substring.

    Args:
        rules_df: DataFrame with '错误表述' and '建议修改词语' columns.

    Returns:
        dict[str, list[str]]: error phrase -> exemption phrases, longest
        first; phrases of equal length are ordered by their text so the
        result does not depend on set iteration order. Error phrases with no
        exemption are absent from the dict.
    """
    exemption_rules = defaultdict(set)
    for _, row in rules_df.iterrows():
        raw_error_phrase = row['错误表述']
        # A blank cell arrives as NaN/None; str() would turn it into the
        # literal text 'nan'/'None' and it would be treated as a real phrase.
        if pd.isna(raw_error_phrase):
            continue
        error_phrase = str(raw_error_phrase).strip()
        if not error_phrase:
            continue
        for suggestion_phrase in split_candidate_phrases(row['建议修改词语']):
            if suggestion_phrase != error_phrase and error_phrase in suggestion_phrase:
                exemption_rules[error_phrase].add(suggestion_phrase)
    return {
        # Longest first; the text tie-break keeps equal-length phrases in a
        # reproducible order (plain key=len would inherit the set's order).
        error_phrase: sorted(suggestion_phrases, key=lambda phrase: (-len(phrase), phrase))
        for error_phrase, suggestion_phrases in exemption_rules.items()
    }
# Exemption table built once, at import time, from the screening sheet (df_s).
# should_skip_error_phrase() falls back to it when no table is passed in.
PHRASE_EXEMPTION_RULES = build_phrase_exemption_rules(df_s)
def _is_inside_full_phrase(content, start, error_phrase, full_phrases):
    """Return True if the error-phrase hit at ``start`` lies within a full phrase."""
    for full_phrase in full_phrases:
        # The error phrase may sit at any offset (possibly several) inside the
        # full phrase; try aligning the full phrase on each of them.
        offset = full_phrase.find(error_phrase)
        while offset != -1:
            begin = start - offset
            if begin >= 0 and content.startswith(full_phrase, begin):
                return True
            offset = full_phrase.find(error_phrase, offset + 1)
    return False


def should_skip_error_phrase(error_phrase, content, exemption_rules=None):
    """Decide whether a hit for ``error_phrase`` in ``content`` should be ignored.

    A hit is ignored only when every occurrence of ``error_phrase`` in
    ``content`` is part of one of its exempting full phrases. If the short
    phrase also appears on its own anywhere in the text, the hit is kept, so
    a correct full phrase elsewhere does not hide a real error.

    Args:
        error_phrase: The phrase being screened for.
        content: Text in which the phrase was matched; non-string or empty
            values never cause a skip.
        exemption_rules: Mapping of error phrase -> iterable of full phrases
            that contain it. Defaults to the module-level
            ``PHRASE_EXEMPTION_RULES``.

    Returns:
        bool: True when the hit should be skipped, False when it should be
        reported.
    """
    # The '20大' rule is skipped unconditionally, before any content check.
    if error_phrase == '20大':
        return True
    if not isinstance(content, str) or not content:
        return False
    if exemption_rules is None:
        exemption_rules = PHRASE_EXEMPTION_RULES

    full_phrases = exemption_rules.get(error_phrase, [])
    if not full_phrases:
        return False

    start = content.find(error_phrase)
    if start == -1:
        # Nothing matched, so there is nothing to exempt.
        return False
    while start != -1:
        if not _is_inside_full_phrase(content, start, error_phrase, full_phrases):
            # A bare occurrence exists: this is a genuine hit.
            return False
        # Advance by one character so overlapping occurrences are also checked.
        start = content.find(error_phrase, start + 1)
    return True
def fix_url_scheme(url, default_scheme='http'):
# 检查URL是否包含方案
if not url.startswith('http://') and not url.startswith('https://'):
@ -502,14 +544,12 @@ def ana_wechat():
index = 1
for ind, row in df_s.iterrows():
mask = df['content'].str.contains(row['错误表述'])
mask = df['content'].str.contains(row['错误表述'], regex=False)
result = df[mask]
if not result.empty:
for ind2, row2 in result.iterrows():
if row['错误表述'] == '“两学一做”学习' and '“两学一做”学习教育' in row2['content']:
continue
if row['错误表述'] == '20大':
if should_skip_error_phrase(row['错误表述'], row2['content']):
continue
output_row = [
index,
@ -554,13 +594,11 @@ def ana_web():
if os.path.exists(full_path) and os.path.getsize(full_path) > 0:
df = pd.read_excel(os.path.join(full_path), engine='openpyxl')
for ind, row in df_s.iterrows():
mask = df['text'].str.contains(row['错误表述'], na=False)
mask = df['text'].str.contains(row['错误表述'], na=False, regex=False)
result = df[mask]
if not result.empty:
for ind2, row2 in result.iterrows():
if row['错误表述'] == '“两学一做”学习' and '“两学一做”学习教育' in row2['text']:
continue
if row['错误表述'] == '20大':
if should_skip_error_phrase(row['错误表述'], row2['text']):
continue
output_row = [
index,
@ -616,4 +654,4 @@ if __name__ == "__main__":
print(df)
cur.close()
except Exception as e:
pass
pass

62
tests/test_main.py Normal file
View File

@ -0,0 +1,62 @@
import unittest
import pandas as pd
from mycode import main
class PhraseExemptionRulesTest(unittest.TestCase):
    """Checks for the phrase-exemption helpers exposed by ``main``."""

    def test_builds_exemption_rules_from_suggestion_phrases(self):
        """Suggestions that extend the error phrase become its exemptions."""
        rows = [
            {
                '错误表述': '深入贯彻中央八项规定精神',
                '建议修改词语': '深入贯彻中央八项规定精神学习教育',
                '错误分类': '固定表述错误'
            },
            {
                '错误表述': '“两学一做”学习',
                '建议修改词语': '“两学一做”学习教育',
                '错误分类': '固定表述错误'
            },
        ]

        built = main.build_phrase_exemption_rules(pd.DataFrame(rows))

        self.assertEqual(
            built['深入贯彻中央八项规定精神'],
            ['深入贯彻中央八项规定精神学习教育']
        )
        self.assertEqual(
            built['“两学一做”学习'],
            ['“两学一做”学习教育']
        )

    def test_skips_short_error_phrase_when_full_correct_phrase_exists(self):
        """A hit is skipped when the text carries the complete phrase."""
        rules = {
            '深入贯彻中央八项规定精神': ['深入贯彻中央八项规定精神学习教育']
        }
        text = '现开展深入贯彻中央八项规定精神学习教育相关工作。'

        self.assertTrue(
            main.should_skip_error_phrase('深入贯彻中央八项规定精神', text, rules)
        )

    def test_does_not_skip_when_only_short_error_phrase_exists(self):
        """A hit is kept when only the truncated phrase is present."""
        rules = {
            '深入贯彻中央八项规定精神': ['深入贯彻中央八项规定精神学习教育']
        }
        text = '文章仅写到深入贯彻中央八项规定精神,没有写完整。'

        self.assertFalse(
            main.should_skip_error_phrase('深入贯彻中央八项规定精神', text, rules)
        )
# Run the test cases when this file is executed directly as a script.
if __name__ == '__main__':
    unittest.main()