fix(screening): avoid false positives from full phrases
This commit is contained in:
parent
89cf6ccc91
commit
a81b183b97
|
|
@ -7,6 +7,7 @@ from openpyxl import load_workbook
|
|||
from urllib.parse import urlparse
|
||||
from datetime import datetime
|
||||
import numpy as np
|
||||
from collections import defaultdict
|
||||
|
||||
wechat_dir = os.path.join(BASE_DIR, 'article')
|
||||
web_dir = os.path.join(BASE_DIR, 'web_dir')
|
||||
|
|
@ -14,6 +15,47 @@ output_dir = os.path.join(BASE_DIR, 'summary')
|
|||
df_s = pd.read_excel(os.path.join(BASE_DIR, 'biao.xlsx'), sheet_name='筛查内容')
|
||||
|
||||
|
||||
def split_candidate_phrases(text):
    """Split one cell of suggested phrases into a list of individual phrases.

    The cell may hold several alternatives separated by commas, semicolons
    (ASCII or full-width), the ideographic enumeration comma, or newlines.

    Args:
        text: Raw cell value; any type accepted by ``str()``. Missing values
            (``None`` / NaN, as detected by ``pd.isna``) yield an empty list.

    Returns:
        The non-empty phrases in their original order, each stripped of
        surrounding whitespace.
    """
    if pd.isna(text):
        return []
    # Separators are written as escapes so the full-width forms cannot be
    # silently folded into their ASCII look-alikes by an editor or tool:
    #   \uff0c = full-width comma, \uff1b = full-width semicolon,
    #   \u3001 = ideographic enumeration comma.
    parts = re.split(r'[,;\uff0c\uff1b\u3001\n]+', str(text))
    stripped_parts = (part.strip() for part in parts)
    return [part for part in stripped_parts if part]
|
||||
|
||||
|
||||
def build_phrase_exemption_rules(rules_df):
    """Derive, per error phrase, the longer correct phrases that contain it.

    A suggestion phrase becomes an exemption for an error phrase when it is
    different from the error phrase and contains it as a substring.

    Args:
        rules_df: DataFrame whose rows provide the columns ``'错误表述'``
            (error phrase) and ``'建议修改词语'`` (suggested replacements,
            split with ``split_candidate_phrases``).

    Returns:
        A plain dict mapping each error phrase that has at least one
        exemption to a list of its exemption phrases, longest first.
    """
    exemption_rules = defaultdict(set)
    for _, row in rules_df.iterrows():
        raw_error_phrase = row['错误表述']
        # A missing cell would otherwise be stringified to 'nan' / 'None'
        # and slip past the emptiness check below.
        if pd.isna(raw_error_phrase):
            continue
        error_phrase = str(raw_error_phrase).strip()
        if not error_phrase:
            continue
        for suggestion_phrase in split_candidate_phrases(row['建议修改词语']):
            if suggestion_phrase != error_phrase and error_phrase in suggestion_phrase:
                exemption_rules[error_phrase].add(suggestion_phrase)
    # Longest first; ties are broken alphabetically so the result does not
    # depend on set iteration order.
    return {
        error_phrase: sorted(
            suggestion_phrases,
            key=lambda phrase: (-len(phrase), phrase),
        )
        for error_phrase, suggestion_phrases in exemption_rules.items()
    }
|
||||
|
||||
|
||||
PHRASE_EXEMPTION_RULES = build_phrase_exemption_rules(df_s)
|
||||
|
||||
|
||||
def should_skip_error_phrase(error_phrase, content, exemption_rules=None):
    """Decide whether a hit for ``error_phrase`` in ``content`` should be ignored.

    A hit is ignored when every occurrence of the error phrase is merely part
    of a longer, correct full phrase listed in the exemption rules. If the
    error phrase also occurs on its own elsewhere in the content, the hit is
    kept so the standalone misuse is still reported.

    Args:
        error_phrase: The flagged phrase being checked.
        content: Text in which the phrase was found. Non-string or empty
            values are never skipped (except for the special case below).
        exemption_rules: Optional mapping of error phrase to a list of full
            phrases; defaults to the module-level ``PHRASE_EXEMPTION_RULES``.

    Returns:
        ``True`` if the hit should be skipped, ``False`` otherwise.
    """
    # '20大' is unconditionally skipped, regardless of content.
    # NOTE(review): reason is not visible here — presumably a known noisy rule.
    if error_phrase == '20大':
        return True
    if not isinstance(content, str) or not content:
        return False
    if exemption_rules is None:
        exemption_rules = PHRASE_EXEMPTION_RULES

    # Mask longer phrases first so a shorter exemption cannot break up a
    # longer one that contains it.
    full_phrases = sorted(
        (phrase for phrase in exemption_rules.get(error_phrase, []) if phrase),
        key=len,
        reverse=True,
    )
    remaining = content
    found_full_phrase = False
    for full_phrase in full_phrases:
        if full_phrase in remaining:
            found_full_phrase = True
            # Replace with a sentinel rather than deleting, so the text on
            # either side cannot join up into a spurious match.
            remaining = remaining.replace(full_phrase, '\0')
    # Skip only when a full phrase was present and no standalone occurrence
    # of the error phrase is left over.
    return found_full_phrase and error_phrase not in remaining
|
||||
|
||||
|
||||
def fix_url_scheme(url, default_scheme='http'):
|
||||
# 检查URL是否包含方案
|
||||
if not url.startswith('http://') and not url.startswith('https://'):
|
||||
|
|
@ -502,14 +544,12 @@ def ana_wechat():
|
|||
index = 1
|
||||
|
||||
for ind, row in df_s.iterrows():
|
||||
mask = df['content'].str.contains(row['错误表述'])
|
||||
mask = df['content'].str.contains(row['错误表述'], regex=False)
|
||||
result = df[mask]
|
||||
|
||||
if not result.empty:
|
||||
for ind2, row2 in result.iterrows():
|
||||
if row['错误表述'] == '“两学一做”学习' and '“两学一做”学习教育' in row2['content']:
|
||||
continue
|
||||
if row['错误表述'] == '20大':
|
||||
if should_skip_error_phrase(row['错误表述'], row2['content']):
|
||||
continue
|
||||
output_row = [
|
||||
index,
|
||||
|
|
@ -554,13 +594,11 @@ def ana_web():
|
|||
if os.path.exists(full_path) and os.path.getsize(full_path) > 0:
|
||||
df = pd.read_excel(os.path.join(full_path), engine='openpyxl')
|
||||
for ind, row in df_s.iterrows():
|
||||
mask = df['text'].str.contains(row['错误表述'], na=False)
|
||||
mask = df['text'].str.contains(row['错误表述'], na=False, regex=False)
|
||||
result = df[mask]
|
||||
if not result.empty:
|
||||
for ind2, row2 in result.iterrows():
|
||||
if row['错误表述'] == '“两学一做”学习' and '“两学一做”学习教育' in row2['text']:
|
||||
continue
|
||||
if row['错误表述'] == '20大':
|
||||
if should_skip_error_phrase(row['错误表述'], row2['text']):
|
||||
continue
|
||||
output_row = [
|
||||
index,
|
||||
|
|
@ -616,4 +654,4 @@ if __name__ == "__main__":
|
|||
print(df)
|
||||
cur.close()
|
||||
except Exception as e:
|
||||
pass
|
||||
pass
|
||||
|
|
|
|||
|
|
@ -0,0 +1,62 @@
|
|||
import unittest
|
||||
|
||||
import pandas as pd
|
||||
|
||||
from mycode import main
|
||||
|
||||
|
||||
class PhraseExemptionRulesTest(unittest.TestCase):
    """Tests for the phrase-exemption helpers exposed by ``main``."""

    def test_builds_exemption_rules_from_suggestion_phrases(self):
        """Each error phrase maps to the longer suggestion that contains it."""
        rule_rows = [
            {
                '错误表述': '深入贯彻中央八项规定精神',
                '建议修改词语': '深入贯彻中央八项规定精神学习教育',
                '错误分类': '固定表述错误',
            },
            {
                '错误表述': '“两学一做”学习',
                '建议修改词语': '“两学一做”学习教育',
                '错误分类': '固定表述错误',
            },
        ]
        built_rules = main.build_phrase_exemption_rules(pd.DataFrame(rule_rows))

        self.assertEqual(
            built_rules['深入贯彻中央八项规定精神'],
            ['深入贯彻中央八项规定精神学习教育'],
        )
        self.assertEqual(
            built_rules['“两学一做”学习'],
            ['“两学一做”学习教育'],
        )

    def test_skips_short_error_phrase_when_full_correct_phrase_exists(self):
        """Content that uses the full correct phrase is not reported."""
        rules = {
            '深入贯彻中央八项规定精神': ['深入贯彻中央八项规定精神学习教育'],
        }
        text = '现开展深入贯彻中央八项规定精神学习教育相关工作。'

        self.assertTrue(
            main.should_skip_error_phrase('深入贯彻中央八项规定精神', text, rules)
        )

    def test_does_not_skip_when_only_short_error_phrase_exists(self):
        """Content that only has the truncated phrase is still reported."""
        rules = {
            '深入贯彻中央八项规定精神': ['深入贯彻中央八项规定精神学习教育'],
        }
        text = '文章仅写到深入贯彻中央八项规定精神,没有写完整。'

        self.assertFalse(
            main.should_skip_error_phrase('深入贯彻中央八项规定精神', text, rules)
        )
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
unittest.main()
|
||||
Loading…
Reference in New Issue