fix(screening): avoid false positives from full phrases
This commit is contained in:
parent
89cf6ccc91
commit
a81b183b97
|
|
@ -7,6 +7,7 @@ from openpyxl import load_workbook
|
||||||
from urllib.parse import urlparse
|
from urllib.parse import urlparse
|
||||||
from datetime import datetime
|
from datetime import datetime
|
||||||
import numpy as np
|
import numpy as np
|
||||||
|
from collections import defaultdict
|
||||||
|
|
||||||
wechat_dir = os.path.join(BASE_DIR, 'article')
|
wechat_dir = os.path.join(BASE_DIR, 'article')
|
||||||
web_dir = os.path.join(BASE_DIR, 'web_dir')
|
web_dir = os.path.join(BASE_DIR, 'web_dir')
|
||||||
|
|
@ -14,6 +15,47 @@ output_dir = os.path.join(BASE_DIR, 'summary')
|
||||||
df_s = pd.read_excel(os.path.join(BASE_DIR, 'biao.xlsx'), sheet_name='筛查内容')
|
df_s = pd.read_excel(os.path.join(BASE_DIR, 'biao.xlsx'), sheet_name='筛查内容')
|
||||||
|
|
||||||
|
|
||||||
|
def split_candidate_phrases(text):
    """Split a cell holding several candidate phrases into a clean list.

    Phrases may be separated by ASCII or full-width commas/semicolons, the
    ideographic enumeration comma, or newlines. Consecutive separators count
    as one.

    Args:
        text: The raw cell value. Missing values (``None``/NaN) are accepted;
            any other value is converted with ``str()`` before splitting.

    Returns:
        The whitespace-stripped, non-empty phrases in their original order.
        An empty list when ``text`` is missing or holds only separators.
    """
    if pd.isna(text):
        return []
    # The full-width comma/semicolon are written as escapes so they stay
    # visibly distinct from their ASCII look-alikes in the character class.
    pieces = re.split(r'[,;\uff0c\uff1b、\n]+', str(text))
    stripped = (piece.strip() for piece in pieces)
    return [piece for piece in stripped if piece]
|
||||||
|
|
||||||
|
|
||||||
|
def build_phrase_exemption_rules(rules_df):
    """Derive, per error phrase, the longer correct phrases that contain it.

    For every row, each candidate in the '建议修改词语' cell that strictly
    contains the row's '错误表述' value is recorded as an exemption for that
    error phrase.

    Args:
        rules_df: DataFrame with the columns '错误表述' (error phrase) and
            '建议修改词语' (one or more suggested phrases in a single cell).

    Returns:
        A plain dict mapping each error phrase to a list of its containing
        suggestion phrases, longest first. Phrases of equal length are
        ordered alphabetically so the result is the same on every run.
        Error phrases without any containing suggestion are absent.
    """
    exemption_rules = defaultdict(set)
    for raw_error_phrase, raw_suggestions in zip(
        rules_df['错误表述'], rules_df['建议修改词语']
    ):
        # A missing cell must be skipped explicitly: str(NaN) is the
        # non-empty text 'nan', which would otherwise be treated as a phrase.
        if pd.isna(raw_error_phrase):
            continue
        error_phrase = str(raw_error_phrase).strip()
        if not error_phrase:
            continue
        for suggestion_phrase in split_candidate_phrases(raw_suggestions):
            if suggestion_phrase != error_phrase and error_phrase in suggestion_phrase:
                exemption_rules[error_phrase].add(suggestion_phrase)
    return {
        error_phrase: sorted(
            suggestion_phrases,
            # Longest first; the phrase itself breaks ties because set
            # iteration order is not stable between interpreter runs.
            key=lambda phrase: (-len(phrase), phrase),
        )
        for error_phrase, suggestion_phrases in exemption_rules.items()
    }
|
||||||
|
|
||||||
|
|
||||||
|
PHRASE_EXEMPTION_RULES = build_phrase_exemption_rules(df_s)
|
||||||
|
|
||||||
|
|
||||||
|
def should_skip_error_phrase(error_phrase, content, exemption_rules=None):
    """Decide whether a hit for ``error_phrase`` in ``content`` should be ignored.

    A hit is ignored when every occurrence of the error phrase is part of one
    of its exempting full phrases. If the full phrase appears but the bare
    error phrase also occurs somewhere else in the same content, the hit is
    kept so that the genuine error is still reported.

    Args:
        error_phrase: The error phrase that matched.
        content: The text that was searched. Non-string or empty values are
            never skipped (except for the '20大' rule below).
        exemption_rules: Mapping of error phrase to its exempting full
            phrases. Defaults to the module-level ``PHRASE_EXEMPTION_RULES``.

    Returns:
        ``True`` if the hit should be skipped, ``False`` otherwise.
    """
    # The '20大' rule is unconditionally skipped, whatever the content is.
    if error_phrase == '20大':
        return True
    if not isinstance(content, str) or not content:
        return False
    if exemption_rules is None:
        exemption_rules = PHRASE_EXEMPTION_RULES

    remaining = content
    found_full_phrase = False
    for full_phrase in exemption_rules.get(error_phrase, ()):
        if full_phrase and full_phrase in remaining:
            found_full_phrase = True
            # Mask with a sentinel rather than deleting, so the text on both
            # sides of the match cannot join up into a new false occurrence.
            remaining = remaining.replace(full_phrase, '\x00')
    # Skip only if a full phrase was present and no bare occurrence is left.
    return found_full_phrase and error_phrase not in remaining
|
||||||
|
|
||||||
|
|
||||||
def fix_url_scheme(url, default_scheme='http'):
|
def fix_url_scheme(url, default_scheme='http'):
|
||||||
# 检查URL是否包含方案
|
# 检查URL是否包含方案
|
||||||
if not url.startswith('http://') and not url.startswith('https://'):
|
if not url.startswith('http://') and not url.startswith('https://'):
|
||||||
|
|
@ -502,14 +544,12 @@ def ana_wechat():
|
||||||
index = 1
|
index = 1
|
||||||
|
|
||||||
for ind, row in df_s.iterrows():
|
for ind, row in df_s.iterrows():
|
||||||
mask = df['content'].str.contains(row['错误表述'])
|
mask = df['content'].str.contains(row['错误表述'], regex=False)
|
||||||
result = df[mask]
|
result = df[mask]
|
||||||
|
|
||||||
if not result.empty:
|
if not result.empty:
|
||||||
for ind2, row2 in result.iterrows():
|
for ind2, row2 in result.iterrows():
|
||||||
if row['错误表述'] == '“两学一做”学习' and '“两学一做”学习教育' in row2['content']:
|
if should_skip_error_phrase(row['错误表述'], row2['content']):
|
||||||
continue
|
|
||||||
if row['错误表述'] == '20大':
|
|
||||||
continue
|
continue
|
||||||
output_row = [
|
output_row = [
|
||||||
index,
|
index,
|
||||||
|
|
@ -554,13 +594,11 @@ def ana_web():
|
||||||
if os.path.exists(full_path) and os.path.getsize(full_path) > 0:
|
if os.path.exists(full_path) and os.path.getsize(full_path) > 0:
|
||||||
df = pd.read_excel(os.path.join(full_path), engine='openpyxl')
|
df = pd.read_excel(os.path.join(full_path), engine='openpyxl')
|
||||||
for ind, row in df_s.iterrows():
|
for ind, row in df_s.iterrows():
|
||||||
mask = df['text'].str.contains(row['错误表述'], na=False)
|
mask = df['text'].str.contains(row['错误表述'], na=False, regex=False)
|
||||||
result = df[mask]
|
result = df[mask]
|
||||||
if not result.empty:
|
if not result.empty:
|
||||||
for ind2, row2 in result.iterrows():
|
for ind2, row2 in result.iterrows():
|
||||||
if row['错误表述'] == '“两学一做”学习' and '“两学一做”学习教育' in row2['text']:
|
if should_skip_error_phrase(row['错误表述'], row2['text']):
|
||||||
continue
|
|
||||||
if row['错误表述'] == '20大':
|
|
||||||
continue
|
continue
|
||||||
output_row = [
|
output_row = [
|
||||||
index,
|
index,
|
||||||
|
|
@ -616,4 +654,4 @@ if __name__ == "__main__":
|
||||||
print(df)
|
print(df)
|
||||||
cur.close()
|
cur.close()
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
pass
|
pass
|
||||||
|
|
|
||||||
|
|
@ -0,0 +1,62 @@
|
||||||
|
import unittest
|
||||||
|
|
||||||
|
import pandas as pd
|
||||||
|
|
||||||
|
from mycode import main
|
||||||
|
|
||||||
|
|
||||||
|
class PhraseExemptionRulesTest(unittest.TestCase):
    """Checks for building phrase-exemption rules and applying them."""

    def test_builds_exemption_rules_from_suggestion_phrases(self):
        # Each record is (error phrase, suggested phrase, error category).
        records = [
            ('深入贯彻中央八项规定精神', '深入贯彻中央八项规定精神学习教育', '固定表述错误'),
            ('“两学一做”学习', '“两学一做”学习教育', '固定表述错误'),
        ]
        rules_df = pd.DataFrame(
            records,
            columns=['错误表述', '建议修改词语', '错误分类'],
        )

        built_rules = main.build_phrase_exemption_rules(rules_df)

        # Every error phrase maps to the one longer phrase that contains it.
        self.assertEqual(
            built_rules['深入贯彻中央八项规定精神'],
            ['深入贯彻中央八项规定精神学习教育'],
        )
        self.assertEqual(
            built_rules['“两学一做”学习'],
            ['“两学一做”学习教育'],
        )

    def test_skips_short_error_phrase_when_full_correct_phrase_exists(self):
        short_phrase = '深入贯彻中央八项规定精神'
        rules = {short_phrase: ['深入贯彻中央八项规定精神学习教育']}
        text = '现开展深入贯彻中央八项规定精神学习教育相关工作。'

        self.assertTrue(main.should_skip_error_phrase(short_phrase, text, rules))

    def test_does_not_skip_when_only_short_error_phrase_exists(self):
        short_phrase = '深入贯彻中央八项规定精神'
        rules = {short_phrase: ['深入贯彻中央八项规定精神学习教育']}
        text = '文章仅写到深入贯彻中央八项规定精神,没有写完整。'

        self.assertFalse(main.should_skip_error_phrase(short_phrase, text, rules))
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == '__main__':
|
||||||
|
unittest.main()
|
||||||
Loading…
Reference in New Issue