From b087a94335acbc060adb1fbccb9a6cff3a393d50 Mon Sep 17 00:00:00 2001 From: caoqianming Date: Fri, 16 Jan 2026 16:58:18 +0800 Subject: [PATCH] =?UTF-8?q?feat:=20=E6=B7=BB=E5=8A=A0=E8=B4=B7=E5=90=8E?= =?UTF-8?q?=E8=AE=A1=E7=AE=97?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- apps/carbon/migrations/0002_fingerprint.py | 33 +++++++++ apps/carbon/models.py | 26 ++++++- apps/carbon/service.py | 16 ++++- apps/carbon/views.py | 82 ++++++++++++++++++++-- 4 files changed, 147 insertions(+), 10 deletions(-) create mode 100644 apps/carbon/migrations/0002_fingerprint.py diff --git a/apps/carbon/migrations/0002_fingerprint.py b/apps/carbon/migrations/0002_fingerprint.py new file mode 100644 index 0000000..27558fa --- /dev/null +++ b/apps/carbon/migrations/0002_fingerprint.py @@ -0,0 +1,33 @@ +# Generated by Django 4.2.27 on 2026-01-16 08:34 + +from django.db import migrations, models +import django.utils.timezone + + +class Migration(migrations.Migration): + + dependencies = [ + ('carbon', '0001_initial'), + ] + + operations = [ + migrations.CreateModel( + name='Fingerprint', + fields=[ + ('id', models.CharField(editable=False, help_text='主键ID', max_length=20, primary_key=True, serialize=False, verbose_name='主键ID')), + ('create_time', models.DateTimeField(default=django.utils.timezone.now, help_text='创建时间', verbose_name='创建时间')), + ('update_time', models.DateTimeField(auto_now=True, help_text='修改时间', verbose_name='修改时间')), + ('is_deleted', models.BooleanField(default=False, help_text='删除标记', verbose_name='删除标记')), + ('fp_hex', models.CharField(max_length=16, unique=True, verbose_name='simhash(hex)')), + ('fp_int', models.BigIntegerField(db_index=True, verbose_name='simhash(int)')), + ('seg1', models.IntegerField(db_index=True)), + ('seg2', models.IntegerField(db_index=True)), + ('seg3', models.IntegerField(db_index=True)), + ('seg4', models.IntegerField(db_index=True)), + ('score', models.FloatField(verbose_name='得分')), + ], + options={ + 'indexes': [models.Index(fields=['seg1'], name='carbon_fing_seg1_a18a6c_idx'), models.Index(fields=['seg2'], name='carbon_fing_seg2_5c4cfa_idx'), models.Index(fields=['seg3'], name='carbon_fing_seg3_87273b_idx'), models.Index(fields=['seg4'], name='carbon_fing_seg4_60f65c_idx')], + }, + ), + ] diff --git a/apps/carbon/models.py b/apps/carbon/models.py index ad64005..4c413b4 100644 --- a/apps/carbon/models.py +++ b/apps/carbon/models.py @@ -1,5 +1,5 @@ from django.db import models -from apps.utils.models import CommonBDModel +from apps.utils.models import CommonBDModel, BaseModel # Create your models here. class Work(CommonBDModel): @@ -28,3 +28,27 @@ class Work(CommonBDModel): dh_file4 = models.ForeignKey("system.file", on_delete=models.SET_NULL, null=True, blank=True, related_name='work_dh_file4') dh_file5 = models.ForeignKey("system.file", on_delete=models.SET_NULL, null=True, blank=True, related_name='work_dh_file5') dh_file6 = models.ForeignKey("system.file", on_delete=models.SET_NULL, null=True, blank=True, related_name='work_dh_file6') + + +class Fingerprint(BaseModel): + fp_hex = models.CharField( + max_length=16, unique=True, verbose_name="simhash(hex)" + ) + fp_int = models.BigIntegerField( + db_index=True, verbose_name="simhash(int)" + ) + + seg1 = models.IntegerField(db_index=True) + seg2 = models.IntegerField(db_index=True) + seg3 = models.IntegerField(db_index=True) + seg4 = models.IntegerField(db_index=True) + + score = models.FloatField(verbose_name="得分") + + class Meta: + indexes = [ + models.Index(fields=["seg1"]), + models.Index(fields=["seg2"]), + models.Index(fields=["seg3"]), + models.Index(fields=["seg4"]), + ] diff --git a/apps/carbon/service.py b/apps/carbon/service.py index 80fe58b..49130e2 100644 --- a/apps/carbon/service.py +++ b/apps/carbon/service.py @@ -59,6 +59,16 @@ def parse_file(file_path:str): def get_fingerprint(text): return Simhash(text).value -# --- 汉明距离 --- -def hamming_distance(a, b): - return bin(a ^ b).count("1") \ No newline at end of file +MASK_64 = (1 << 64) - 1 + +def hamming_distance(a_u, b_s): + return ((a_u ^ (b_s & MASK_64)) & MASK_64).bit_count() + + +def split_simhash(fp_int: int): + return ( + (fp_int >> 48) & 0xffff, + (fp_int >> 32) & 0xffff, + (fp_int >> 16) & 0xffff, + fp_int & 0xffff, + ) \ No newline at end of file diff --git a/apps/carbon/views.py b/apps/carbon/views.py index 8884032..c4d70ee 100644 --- a/apps/carbon/views.py +++ b/apps/carbon/views.py @@ -1,17 +1,18 @@ from django.shortcuts import render -from .models import Work +from .models import Work, Fingerprint from .serializers import WorkSerializer, WorkCreateSerializer, WorkDqCalSerializer, WorkDhCalSerializer from apps.utils.viewsets import CustomModelViewSet from rest_framework.decorators import action import os from django.conf import settings import json -from apps.carbon.service import parse_file +from apps.carbon.service import parse_file, get_fingerprint, hamming_distance, split_simhash import requests from rest_framework.exceptions import ParseError import re from rest_framework.response import Response -from django.db import transaction +from django.db import transaction, IntegrityError +from django.db.models import Q # Create your views here. LLM_URL = "http://106.0.4.200:9000/v1/chat/completions" @@ -48,6 +49,11 @@ def ask(input:str, p_name:str, stream=False): raise ParseError("模型处理错误超过最大token限制") return response.json()["choices"][0]["message"]["content"] +def simhash_to_db(n: int) -> int: + return n if n < (1 << 63) else n - (1 << 64) + +def simhash_from_db(n: int) -> int: + return n if n >= 0 else n + (1 << 64) class WorkViewSet(CustomModelViewSet): queryset = Work.objects.all() @@ -246,14 +252,78 @@ class WorkViewSet(CustomModelViewSet): return Response({"total_score": total_score, "data": data}) @staticmethod - def parse_files(): - pass + def parse_files(work: Work): + contents = [] + filenames = [] + for file in [work.dh_file1, work.dh_file2, work.dh_file3, work.dh_file4, work.dh_file5, work.dh_file6]: + if file: + if file.name in filenames: + continue + path = (settings.BASE_DIR + file.path).replace('\\', '/') + content = parse_file(path) + filenames.append(file.name) + contents.append(content) + return '\n'.join(contents) @action(detail=True, methods=['post'], serializer_class=WorkDhCalSerializer) @transaction.atomic def cal_dh(self, request, pk): work = self.get_object() - + sr = WorkDqCalSerializer(work, data=request.data) + sr.is_valid(raise_exception=True) + sr.save() + work = Work.objects.get(pk=pk) + content = WorkViewSet.parse_files(work) + + fp_u = get_fingerprint(content) # unsigned + fp_int = simhash_to_db(fp_u) # signed for db + fp_hex = format(fp_u, "016x") + + s1, s2, s3, s4 = split_simhash(fp_int) + + # 1️⃣ 分段粗筛 + candidates = ( + Fingerprint.objects + .filter( + Q(seg1=s1) | + Q(seg2=s2) | + Q(seg3=s3) | + Q(seg4=s4) + ) + .only("fp_int", "score") + ) + + # 2️⃣ 精确海明距离 + for obj in candidates: + if hamming_distance(fp_u, obj.fp_int) <= HAMMING_THRESHOLD: + work.score_dh = obj.score + work.save(update_fields=["score_dh"]) + return Response({"total_score": obj.score}) + + # 3️⃣ 未命中 → 调用 AI + res = ask(content, "tec_dh") + score = round(float(res), 2) + + work.score_dh = score + work.save(update_fields=["score_dh"]) + + # 4️⃣ 并发安全写入指纹库 + try: + Fingerprint.objects.create( + fp_hex=fp_hex, + fp_int=fp_int, + seg1=s1, + seg2=s2, + seg3=s3, + seg4=s4, + score=score, + ) + except IntegrityError: + # 并发下已存在,忽略即可 + pass + + return Response({"total_score": score}) +