# factory/apps/edu/parse_word.py


from docx import Document
from openpyxl import load_workbook
import re


# Default question category written to column B when the caller gives none.
QUES_CLASS = '安全领域'

# One [regex, excel_column] pair per answer option: option letters A..F
# (followed by a full-width colon) are stored in spreadsheet columns D..I.
# The regex captures the first run of non-whitespace after the marker.
OPTION_LIST = [
    [option_letter + r"：\s*(\S+)", column_letter]
    for option_letter, column_letter in zip("ABCDEF", "DEFGHI")
]

def fill_excel(matches, excel_path, local):
    """Write one value into a cell of the workbook's active sheet.

    The workbook at ``excel_path`` is loaded, the cell addressed by
    ``local`` (a coordinate such as ``'J3'``) is set to ``matches`` when
    ``matches`` is truthy, and the workbook is saved back to the same path.
    The load/save round trip happens even when nothing is written.
    """
    workbook = load_workbook(excel_path)
    if matches:
        workbook.active[local] = matches
    workbook.save(excel_path)


def match_text(text, pattern):
    """Return capture group 1 of the first ``pattern`` hit in ``text``.

    The search is unanchored (``re.search``). An empty string is returned
    when the pattern does not occur in ``text``.
    """
    found = re.search(pattern, text)
    return found.group(1) if found else ''

def interpret_text(start:int, excel_path:str, doc_path:str, field=None):
    """Parse quiz questions from a Word document into an Excel sheet.

    Every paragraph of ``doc_path`` is scanned for three kinds of content:

    * a correct-answer marker (``正确答案：``), stored in column J;
    * option markers from ``OPTION_LIST`` (``A：`` .. ``F：``), stored in
      columns D .. I;
    * a question header, i.e. a paragraph whose first character is ``【``
      and whose fifth character is ``】``.  The two characters after ``【``
      are stored as the question type in column A, the category (``field``
      or, when that is falsy, ``QUES_CLASS``) in column B, and -- when the
      paragraph's second-to-last character is ``分`` -- the text between
      the header and the last four characters in column C, with a leading
      ``<digits>、`` number removed.

    Each column is filled downwards from row ``start`` in the order its
    values occur in the document.  The workbook is loaded and saved once;
    it is not touched at all when nothing was extracted.

    Args:
        start: Row number of the first row to fill.
        excel_path: Path of the existing workbook; it is overwritten in place.
        doc_path: Path of the Word document to read.
        field: Optional category for column B; defaults to ``QUES_CLASS``.

    Returns:
        The string ``'OK'``.
    """
    # Column letter -> extracted values, in document order.
    columns = {}

    for paragraph in Document(doc_path).paragraphs:
        text = paragraph.text

        answer = match_text(text, r"正确答案：\s*(\S+)")
        if answer:
            columns.setdefault("J", []).append(answer)

        for pattern, column in OPTION_LIST:
            option = match_text(text, pattern)
            if option:
                columns.setdefault(column, []).append(option)

        # Question header, e.g. three characters enclosed in 【 】.
        if text[:1] == '【' and text[4:5] == '】':
            columns.setdefault("A", []).append(text[1:3])
            columns.setdefault("B", []).append(field or QUES_CLASS)
            if text[-2] == '分':  # paragraph ends with a score such as (3分)
                question = text[5:-4].strip()
                # Strip only the leading sequence number.  The pattern is
                # anchored so that "<digits>、" inside the question body is
                # kept, and un-numbered questions are still recorded so that
                # column C stays aligned with columns A and B.
                question = re.sub(r'^\d+、', '', question)
                columns.setdefault("C", []).append(question)

    # NOTE(review): every column is filled independently of the others, so
    # rows only line up when each question contributes exactly one value to
    # every column (same number of options, one answer, a scored header).
    # Confirm against the real document layout before relying on alignment.
    cells = {}
    for column, values in columns.items():
        for offset, value in enumerate(values):
            if value:  # empty values keep their row but are not written
                cells[column + str(start + offset)] = value

    if not cells:
        return 'OK'

    # Load and save the workbook once instead of once per cell.
    wb = load_workbook(excel_path)
    ws = wb.active
    for coordinate, value in cells.items():
        ws[coordinate] = value
    wb.save(excel_path)
    return 'OK'


if __name__ == '__main__':
    # Raw strings keep the Windows backslashes literal; the non-raw forms
    # relied on invalid escape sequences (\c, \d, \q), which newer Python
    # versions warn about.  The resulting paths are unchanged.
    doc_path = r"C:\code\data\test.docx"
    excel_path = r"C:\code\data\question.xlsx"
    # Fill the sheet starting at row 3.
    interpret_text(3, excel_path, doc_path)