rank.py
from pdfminer.high_level import extract_pages
from pdfminer.layout import LTTextContainer
import chromadb
from chromadb.config import Settings
from nltk.tokenize import sent_tokenize
from sentence_transformers import CrossEncoder
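
# NOTE: get_embedding() is used below but never defined in this file; it is
# presumably supplied by the "Connecting to OpenAI" part that follows. A
# minimal sketch, assuming the OpenAI embeddings API is used (the model name
# is an assumption, not something this file specifies):
from openai import OpenAI

openai_client = OpenAI()  # reads OPENAI_API_KEY from the environment

def get_embedding(text, model="text-embedding-ada-002"):
    '''Return the embedding vector for a single piece of text.'''
    return openai_client.embeddings.create(input=[text], model=model).data[0].embedding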
def extract_text_from_pdf(filename, page_numbers=None, min_line_length=10):
    '''Extract text from a PDF file, optionally restricted to the given page numbers.'''
    paragraphs = []
    buffer = ''
    full_text = ''
    # Extract all text
    for i, page_layout in enumerate(extract_pages(filename)):
        # If a page range was specified, skip pages outside it
        if page_numbers is not None and i not in page_numbers:
            continue
        for element in page_layout:
            if isinstance(element, LTTextContainer):
                full_text += element.get_text() + '\n'
    # Re-assemble the text into paragraphs, using blank lines as separators
    lines = full_text.split('\n')
    for text in lines:
        if len(text) >= min_line_length:
            # Join hyphenated line breaks; otherwise keep a space between lines
            buffer += (' ' + text) if not text.endswith('-') else text.strip('-')
        elif buffer:
            paragraphs.append(buffer)
            buffer = ''
    # Flush the final paragraph
    if buffer:
        paragraphs.append(buffer)
    return paragraphs
paragraphs = extract_text_from_pdf("llama2.pdf", page_numbers=[2, 3])

chroma_client = chromadb.Client(Settings(allow_reset=True))
# For demo purposes only; reset() is not needed on every run
chroma_client.reset()
# Create a collection
collection = chroma_client.get_or_create_collection(name="demo")
def semantic_search(query, top_n):
    '''Query the vector database for the top_n most similar chunks.'''
    results = collection.query(
        query_embeddings=[get_embedding(query)],
        n_results=top_n
    )
    return results
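
# Chroma returns a dict of parallel lists, one inner list per query:
# results['documents'][0] holds the top_n chunk texts for our single query,
# and results['distances'][0] the corresponding distances.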
def split_text(paragraphs, chunk_size=300, overlap_size=100):
    '''Split text into chunks of roughly chunk_size, overlapping by overlap_size.'''
    sentences = [s.strip() for p in paragraphs for s in sent_tokenize(p)]
    chunks = []
    i = 0
    while i < len(sentences):
        chunk = sentences[i]
        overlap = ''
        prev = i - 1
        # Walk backwards to collect the overlap with the previous chunk
        while prev >= 0 and len(sentences[prev]) + len(overlap) <= overlap_size:
            overlap = sentences[prev] + ' ' + overlap
            prev -= 1
        chunk = overlap + chunk
        nxt = i + 1
        # Walk forwards to fill the current chunk up to chunk_size
        while nxt < len(sentences) and len(sentences[nxt]) + len(chunk) <= chunk_size:
            chunk = chunk + ' ' + sentences[nxt]
            nxt += 1
        chunks.append(chunk)
        i = nxt
    return chunks
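
# For example, with chunk_size=300 and overlap_size=100, each chunk starts
# with up to ~100 characters of the sentences that ended the previous chunk,
# so content straddling a chunk boundary remains retrievable.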
chunks = split_text(paragraphs, 300, 100)
# Add the chunks and their embeddings to the collection
collection.add(
    embeddings=[get_embedding(chunk) for chunk in chunks],
    documents=chunks,
    metadatas=[{"source": "llama2.pdf"} for _ in chunks],
    ids=[f"ck_{i}" for i in range(len(chunks))]
)
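# (Alternatively, Chroma can compute embeddings itself if the collection is
# created with an embedding_function; here we pass precomputed vectors.)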
user_query = "how safe is llama 2"
search_results = semantic_search(user_query, 5)

# Re-rank the retrieved chunks with a cross-encoder
model = CrossEncoder('cross-encoder/ms-marco-MiniLM-L-6-v2', max_length=512)
scores = model.predict([(user_query, doc) for doc in search_results['documents'][0]])
# Sort by score, highest (most relevant) first
sorted_list = sorted(zip(scores, search_results['documents'][0]), key=lambda x: x[0], reverse=True)
for score, doc in sorted_list:
    print(f"{score}\t{doc}\n")
chinese_utils.py
import re
import jieba
import nltk
from nltk.corpus import stopwords

nltk.download('stopwords')

def to_keywords(input_string):
    """Turn a sentence into a sequence of search keywords."""
    # Tokenize in search-engine mode
    word_tokens = jieba.cut_for_search(input_string)
    # Load the stop-word list
    stop_words = set(stopwords.words('chinese'))
    # Remove stop words
    filtered_sentence = [w for w in word_tokens if w not in stop_words]
    return ' '.join(filtered_sentence)
def sent_tokenize(input_string):
    """Split into sentences on punctuation."""
    # Split after sentence-ending punctuation (both full- and half-width)
    sentences = re.split(r'(?<=[。!?;?!])', input_string)
    # Drop empty strings
    return [sentence for sentence in sentences if sentence.strip()]
if "__main__" == __name__:
# 测试关键词提取
print(to_keywords("小明硕士毕业于中国科学院计算所,后在麻省理工学院深造"))
# 测试断句
print(sent_tokenize("这是,第一句。这是第二句吗?是的!啊"))
Connecting to OpenAI