rongrunxiang 6 месяцев назад
Родитель
Сommit
6e44d520ab

+ 3 - 0
.gitignore

@@ -0,0 +1,3 @@
+/data
+/.idea
+/__pycache__

+ 36 - 0
LLM.py

@@ -0,0 +1,36 @@
+# coding=gbk
+from langchain.llms.base import LLM
+from typing import Any, List, Optional
+from langchain.callbacks.manager import CallbackManagerForLLMRun
+from transformers import AutoTokenizer, AutoModelForCausalLM
+import torch
+
+
+class ChatGLM_LLM(LLM):
+    # 基于本地 InternLM 自定义 LLM 类
+    tokenizer: AutoTokenizer = None
+    model: AutoModelForCausalLM = None
+
+    def __init__(self, model_path: str):
+        # model_path: InternLM 模型路径
+        # 从本地初始化模型
+        super().__init__()
+        print("正在从本地加载模型...")
+        self.tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
+        self.model = AutoModelForCausalLM.from_pretrained(model_path, trust_remote_code=True).to(torch.bfloat16).cuda(
+            device=1)
+        self.model = self.model.eval()
+        print("完成本地模型的加载")
+
+    def _call(self, prompt: str, stop: Optional[List[str]] = None,
+              run_manager: Optional[CallbackManagerForLLMRun] = None,
+              **kwargs: Any):
+        # 重写调用函数
+        response, history = self.model.chat(self.tokenizer, prompt, history=[], do_sample=False)
+        return response
+
+
+
+    @property
+    def _llm_type(self) -> str:
+        return "ChatGLM3-6B"

+ 0 - 2
README.md

@@ -1,2 +0,0 @@
-# why10w
-

+ 54 - 0
answerQuestions.py

@@ -0,0 +1,54 @@
+# coding=gbk
+from langchain.vectorstores import Chroma
+from langchain.embeddings.huggingface import HuggingFaceEmbeddings
+import os
+
+# Define the embedding model (loaded from a local model directory)
+embeddings = HuggingFaceEmbeddings(model_name="/mnt/sdb/zhaoyuan/rongrunxiang/acge_text_embedding")
+
+
+# Persistence directory of the vector database
+persist_directory = 'data_base/vector_db/chroma'
+
+# Open the Chroma database stored in that directory
+vectordb = Chroma(
+    persist_directory=persist_directory,
+    embedding_function=embeddings
+)
+
+from LLM import ChatGLM_LLM
+modelPath = "/mnt/sdb/zhaoyuan/rongrunxiang/glm-4-9b-chat"
+llm = ChatGLM_LLM(model_path = modelPath)
+llm.predict("你是谁")
+
+from langchain.prompts import PromptTemplate
+
+# The prompt template we construct
+template = """根据用户的问题,从提供的上下文中找到相应的知识,并据此创作一个适合于3-6岁小孩的绘本文本。尽量生动有趣。
+{context}
+问题: {question}
+文本:"""
+
+# Instantiate a LangChain template object with the two variables context and question; at call time they are filled with the retrieved document fragments and the user's question
+QA_CHAIN_PROMPT = PromptTemplate(input_variables=["context","question"],template=template)
+
+from langchain.chains import RetrievalQA
+
+qa_chain = RetrievalQA.from_chain_type(llm,retriever=vectordb.as_retriever(),return_source_documents=True,chain_type_kwargs={"prompt":QA_CHAIN_PROMPT})
+
+# Answer produced by the retrieval QA chain
+question = "什么是哺乳动物"
+result = qa_chain({"query": question})
+print("检索问答链回答 question 的结果:")
+print(result["result"])
+
+if "source_documents" in result:
+    print("\n找到的文档片段:")
+    for doc in result["source_documents"]:
+        print(doc)  # assumes a document fragment is directly printable; the format may need adjusting
+
+
+# Answer produced by the LLM alone (no retrieval)
+result_2 = llm(question)
+print("大模型回答 question 的结果:")
+print(result_2)

+ 116 - 0
buildVectors2.py

@@ -0,0 +1,116 @@
+# coding=utf-8
+from langchain.document_loaders import UnstructuredFileLoader
+from langchain.document_loaders import UnstructuredMarkdownLoader
+from langchain.text_splitter import RecursiveCharacterTextSplitter
+from langchain.vectorstores import Chroma
+from langchain.embeddings.huggingface import HuggingFaceEmbeddings
+from langchain_community.document_loaders import PyPDFLoader
+from langchain_community.document_loaders import MathpixPDFLoader
+from langchain_community.document_loaders import UnstructuredPDFLoader
+from langchain_community.document_loaders import AzureAIDocumentIntelligenceLoader
+from langchain_community.document_loaders import Docx2txtLoader
+from langchain_community.document_loaders.csv_loader import CSVLoader
+
+from langchain.text_splitter import MarkdownHeaderTextSplitter
+from tqdm import tqdm
+from sentence_transformers import SentenceTransformer, util
+import os
+import chardet
+
+import erniebot
+import numpy as np
+from langchain_embedding_ErnieBotSDK import ErnieEmbeddings
+
+erniebot.api_type = "aistudio"
+erniebot.access_token = "ff1531c8c0f429f92adbc2eaed2e23bfb5349e0f"
+
+embeddings=ErnieEmbeddings(access_token="ff1531c8c0f429f92adbc2eaed2e23bfb5349e0f", chunk_size=1)
+
+# 获取文件路径函数
+def get_files(dir_path):
+    # args:dir_path,目标文件夹路径
+    file_list = []
+    for filepath, dirnames, filenames in os.walk(dir_path):
+        # os.walk 函数将递归遍历指定文件夹
+        for filename in filenames:
+            # 通过后缀名判断文件类型是否满足要求
+            if filename.endswith(".md"):
+                # 如果满足要求,将其绝对路径加入到结果列表
+                file_list.append(os.path.join(filepath, filename))
+            elif filename.endswith(".txt"):
+                file_list.append(os.path.join(filepath, filename))
+            #elif filename.endswith(".pdf"):
+                #file_list.append(os.path.join(filepath, filename))
+            elif filename.endswith(".docx"):
+                file_list.append(os.path.join(filepath, filename))
+            elif filename.endswith(".csv"):
+                file_list.append(os.path.join(filepath, filename))
+
+    return file_list
+
+
+# 加载文件函数
+def get_text(dir_path):
+    file_lst = get_files(dir_path)
+    docs = []
+    for one_file in tqdm(file_lst):
+        file_type = one_file.split('.')[-1]
+
+        # 尝试检测文件编码
+        with open(one_file, 'rb') as f:
+            rawdata = f.read()
+        encoding = chardet.detect(rawdata)['encoding']
+        print(f"Detected encoding for {one_file}: {encoding}")
+
+        # 根据文件类型创建适当的加载器,并指定编码
+        if file_type == 'md':
+            loader = UnstructuredMarkdownLoader(one_file, encoding=encoding)
+        elif file_type == 'txt':
+            loader = UnstructuredFileLoader(one_file, encoding=encoding)
+        #elif file_type == 'pdf':
+            # loader = PyPDFLoader(one_file)
+            # loader = MathpixPDFLoader(one_file)
+            #loader = UnstructuredPDFLoader(one_file)
+        elif file_type == 'docx':
+            loader = Docx2txtLoader(one_file, encoding=encoding)
+        elif file_type == 'csv':
+            loader = CSVLoader(one_file, encoding=encoding)
+        else:
+            continue
+
+        # 加载文档
+        try:
+            docs.extend(loader.load())
+        except UnicodeDecodeError as e:
+            print(f"Failed to load {one_file} due to encoding error: {str(e)}")
+            continue
+
+    return docs
+
+
+# 目标文件夹
+tar_dir = [
+    "files/",
+]
+
+# 加载目标文件
+docs = []
+for dir_path in tar_dir:
+    print(get_text(dir_path))
+    docs.extend(get_text(dir_path))
+
+print(docs)
+
+#embeddings = HuggingFaceEmbeddings(model_name="/mnt/sdb/zhaoyuan/rongrunxiang/acge_text_embedding")
+
+# 构建向量数据库
+# 定义持久化路径
+persist_directory = 'data_base/vector_db/chroma'
+# 加载数据库
+vectordb = Chroma.from_documents(
+    documents=docs,
+    embedding=embeddings,
+    persist_directory=persist_directory  # 允许我们将persist_directory目录保存到磁盘上
+)
+# 将加载的向量数据库持久化到磁盘上
+vectordb.persist()

+ 128 - 0
buildVectors3.py

@@ -0,0 +1,128 @@
+# coding=utf-8
+import chromadb
+import pandas as pd
+from langchain.document_loaders import UnstructuredFileLoader
+from langchain.document_loaders import UnstructuredMarkdownLoader
+from langchain.text_splitter import RecursiveCharacterTextSplitter
+from langchain.vectorstores import Chroma
+from langchain.embeddings.huggingface import HuggingFaceEmbeddings
+from langchain_community.document_loaders import PyPDFLoader
+from langchain_community.document_loaders import MathpixPDFLoader
+from langchain_community.document_loaders import UnstructuredPDFLoader
+from langchain_community.document_loaders import AzureAIDocumentIntelligenceLoader
+from langchain_community.document_loaders import Docx2txtLoader
+from langchain_community.document_loaders.csv_loader import CSVLoader
+
+from langchain.text_splitter import MarkdownHeaderTextSplitter
+from tqdm import tqdm
+from sentence_transformers import SentenceTransformer, util
+import os
+import chardet
+
+import erniebot
+import numpy as np
+from langchain_embedding_ErnieBotSDK import ErnieEmbeddings
+
+erniebot.api_type = "aistudio"
+erniebot.access_token = "ff1531c8c0f429f92adbc2eaed2e23bfb5349e0f"
+
+embeddings=ErnieEmbeddings(access_token="ff1531c8c0f429f92adbc2eaed2e23bfb5349e0f", chunk_size=1)
+response = erniebot.Embedding.create(
+    model='ernie-text-embedding',
+    input=[
+        "我是百度公司开发的人工智能语言模型,我的中文名是文心一言,英文名是ERNIE-Bot。",
+        "2018年深圳市各区GDP"
+    ])
+
+for embedding in response.get_result():
+    embedding = np.array(embedding)
+    print(embedding)
+
+chroma_client = chromadb.PersistentClient(path="data/chroma")
+collection = chroma_client.create_collection(name="collection")
+
+# 获取文件路径函数
+def get_files(dir_path):
+    # args:dir_path,目标文件夹路径
+    file_list = []
+    for filepath, dirnames, filenames in os.walk(dir_path):
+        # os.walk 函数将递归遍历指定文件夹
+        for filename in filenames:
+            # 通过后缀名判断文件类型是否满足要求
+            if filename.endswith(".csv"):
+                file_list.append(os.path.join(filepath, filename))
+
+    return file_list
+
+
+# 加载文件函数
+def get_text(dir_path):
+    file_lst = get_files(dir_path)
+    docs = []
+    metadatas = []
+    ids = []
+    embeddings = []
+
+    for one_file in tqdm(file_lst):
+        file_type = one_file.split('.')[-1]
+
+        # 尝试检测文件编码
+        with open(one_file, 'rb') as f:
+            rawdata = f.read()
+        encoding = chardet.detect(rawdata)['encoding']
+        print(f"Detected encoding for {one_file}: {encoding}")
+
+        if file_type == 'csv':
+            df = pd.read_csv(one_file)
+            for index, row in df.iterrows():
+                output_str = ""
+                text = row[1]
+                text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=150)
+                all_splits = text_splitter.split_text(text)
+                for split_text in all_splits:
+                    split_text = row[0] + " " + split_text
+                my_list = []
+                my_list.append(output_str)
+
+                response = erniebot.Embedding.create(
+                    model='ernie-text-embedding',
+                    input=my_list)
+                embeddings.append(response.data[0].embedding)
+                docs.append(output_str)
+                metadatas.append({"source": one_file.split(".")[0]})
+                ids.append(f"id{index}")
+
+
+        else:
+            continue
+
+
+    return docs
+
+
+# 目标文件夹
+tar_dir = [
+    "files/",
+]
+
+# 加载目标文件
+docs = []
+for dir_path in tar_dir:
+    print(get_text(dir_path))
+    docs.extend(get_text(dir_path))
+
+print(docs)
+
+#embeddings = HuggingFaceEmbeddings(model_name="/mnt/sdb/zhaoyuan/rongrunxiang/acge_text_embedding")
+
+# 构建向量数据库
+# 定义持久化路径
+persist_directory = 'data_base/vector_db/chroma'
+# 加载数据库
+vectordb = Chroma.from_documents(
+    documents=docs,
+    embedding=embeddings,
+    persist_directory=persist_directory  # 允许我们将persist_directory目录保存到磁盘上
+)
+# 将加载的向量数据库持久化到磁盘上
+vectordb.persist()

Разница между файлами не показана из-за своего большого размера
+ 1 - 0
file_not_used/古生物.csv


Разница между файлами не показана из-за своего большого размера
+ 1 - 0
file_not_used/天文.csv


Разница между файлами не показана из-за своего большого размера
+ 6 - 0
file_not_used/植物.csv


Разница между файлами не показана из-за своего большого размера
+ 2 - 0
file_not_used/海洋.csv


Разница между файлами не показана из-за своего большого размера
+ 1 - 0
file_not_used/生命.csv


Разница между файлами не показана из-за своего большого размера
+ 3 - 0
file_not_used/航天与航空.csv


Разница между файлами не показана из-за своего большого размера
+ 1 - 0
files/动物.csv


+ 179 - 0
langchain_embedding_ErnieBotSDK.py

@@ -0,0 +1,179 @@
+import asyncio
+import logging
+import threading
+from functools import partial
+from typing import Dict, List, Optional
+
+import requests
+
+from langchain.pydantic_v1 import BaseModel, root_validator
+from langchain.schema.embeddings import Embeddings
+from langchain.utils import get_from_dict_or_env
+import erniebot
+import numpy as np
+import time
+import os
+## 注意不要用翻墙
+## https://python.langchain.com/docs/integrations/chat/ernie
+
+logger = logging.getLogger(__name__)
+
+
+class ErnieEmbeddings(BaseModel, Embeddings):
+    """`Ernie Embeddings V1` embedding models."""
+
+    ernie_api_base: Optional[str] = None
+    ernie_client_id: Optional[str] = None
+    ernie_client_secret: Optional[str] = None
+    access_token: Optional[str] = None#erniebot.access_token = '<access-token-for-aistudio>'
+    
+    chunk_size: int = 16
+
+    model_name = "ErnieBot-Embedding-V1"
+
+    _lock = threading.Lock()
+    '''
+    kevin modify:
+    '''
+    @root_validator()
+    def validate_environment(cls, values: Dict) -> Dict:
+        # values["ernie_api_base"] = get_from_dict_or_env(
+        #     values, "ernie_api_base", "ERNIE_API_BASE", "https://aip.baidubce.com"
+        # )
+        values["access_token"] = get_from_dict_or_env(
+            values,
+            "access_token",
+            "ACCESS_TOKEN",
+        )
+        values["api_type"] = 'aistudio'
+        
+        erniebot.api_type = values["api_type"]
+        erniebot.access_token = values["access_token"]
+        return values
+
+    # def _embedding(self, json: object) -> dict:
+        # base_url = (
+        #     f"{self.ernie_api_base}/rpc/2.0/ai_custom/v1/wenxinworkshop/embeddings"
+        # )
+        # resp = requests.post(
+        #     f"{base_url}/embedding-v1",
+        #     headers={
+        #         "Content-Type": "application/json",
+        #     },
+        #     params={"access_token": self.access_token},
+        #     json=json,
+        # )
+        # return resp.json()
+    '''
+    kevin modify:
+    '''
+    def _embedding(self, json: object) -> dict:
+        inputs=json['input']
+        def erniebotSDK(inputs):
+            response = erniebot.Embedding.create(
+                model='ernie-text-embedding',
+                input=inputs)
+            time.sleep(1)
+            return response
+        try:
+            response=erniebotSDK(inputs)
+        except:
+            print('connect erniebot error...wait 2s to retry(kevin)')
+            time.sleep(2)
+            response=erniebotSDK(inputs)
+        return response
+    
+    def _refresh_access_token_with_lock(self) -> None:
+        with self._lock:
+            logger.debug("Refreshing access token")
+            base_url: str = f"{self.ernie_api_base}/oauth/2.0/token"
+            resp = requests.post(
+                base_url,
+                headers={
+                    "Content-Type": "application/json",
+                    "Accept": "application/json",
+                },
+                params={
+                    "grant_type": "client_credentials",
+                    "client_id": self.ernie_client_id,
+                    "client_secret": self.ernie_client_secret,
+                },
+            )
+            self.access_token = str(resp.json().get("access_token"))
+
+    def embed_documents(self, texts: List[str]) -> List[List[float]]:
+        """Embed search docs.
+
+        Args:
+            texts: The list of texts to embed
+
+        Returns:
+            List[List[float]]: List of embeddings, one for each text.
+        """
+
+        if not self.access_token:
+            self._refresh_access_token_with_lock()
+        text_in_chunks = [
+            texts[i : i + self.chunk_size]
+            for i in range(0, len(texts), self.chunk_size)
+        ]
+        lst = []
+        for chunk in text_in_chunks:
+            resp = self._embedding({"input": [text for text in chunk]})
+            if resp.get("error_code"):
+                if resp.get("error_code") == 111:
+                    self._refresh_access_token_with_lock()
+                    resp = self._embedding({"input": [text for text in chunk]})
+                else:
+                    raise ValueError(f"Error from Ernie: {resp}")
+            lst.extend([i["embedding"] for i in resp["data"]])
+        return lst
+
+    def embed_query(self, text: str) -> List[float]:
+        """Embed query text.
+
+        Args:
+            text: The text to embed.
+
+        Returns:
+            List[float]: Embeddings for the text.
+        """
+
+        if not self.access_token:
+            self._refresh_access_token_with_lock()
+        resp = self._embedding({"input": [text]})
+        if resp.get("error_code"):
+            if resp.get("error_code") == 111:
+                self._refresh_access_token_with_lock()
+                resp = self._embedding({"input": [text]})
+            else:
+                raise ValueError(f"Error from Ernie: {resp}")
+        return resp["data"][0]["embedding"]
+
+    async def aembed_query(self, text: str) -> List[float]:
+        """Asynchronous Embed query text.
+
+        Args:
+            text: The text to embed.
+
+        Returns:
+            List[float]: Embeddings for the text.
+        """
+
+        return await asyncio.get_running_loop().run_in_executor(
+            None, partial(self.embed_query, text)
+        )
+
+    async def aembed_documents(self, texts: List[str]) -> List[List[float]]:
+        """Asynchronous Embed search docs.
+
+        Args:
+            texts: The list of texts to embed
+
+        Returns:
+            List[List[float]]: List of embeddings, one for each text.
+        """
+
+        result = await asyncio.gather(*[self.aembed_query(text) for text in texts])
+
+        return list(result)

Разница между файлами не показана из-за своего большого размера
+ 530345 - 0
test.ipynb


Разница между файлами не показана из-за своего большого размера
+ 27 - 0
test.py



Некоторые файлы не были показаны из-за большого количества измененных файлов