rongrunxiang 6 maanden geleden
bovenliggende
commit
fdb27daccc
7 gewijzigde bestanden met toevoegingen van 107 en 535 verwijderingen
  1. 0 36
      LLM.py
  2. 32 49
      answerQuestions.py
  3. 75 0
      buildVectors.py
  4. 0 116
      buildVectors2.py
  5. 0 128
      buildVectors3.py
  6. 0 179
      langchain_embedding_ErnieBotSDK.py
  7. 0 27
      test.py

+ 0 - 36
LLM.py

@@ -1,36 +0,0 @@
-# coding=gbk
-from langchain.llms.base import LLM
-from typing import Any, List, Optional
-from langchain.callbacks.manager import CallbackManagerForLLMRun
-from transformers import AutoTokenizer, AutoModelForCausalLM
-import torch
-
-
-class ChatGLM_LLM(LLM):
-    # 基于本地 InternLM 自定义 LLM 类
-    tokenizer: AutoTokenizer = None
-    model: AutoModelForCausalLM = None
-
-    def __init__(self, model_path: str):
-        # model_path: InternLM 模型路径
-        # 从本地初始化模型
-        super().__init__()
-        print("正在从本地加载模型...")
-        self.tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
-        self.model = AutoModelForCausalLM.from_pretrained(model_path, trust_remote_code=True).to(torch.bfloat16).cuda(
-            device=1)
-        self.model = self.model.eval()
-        print("完成本地模型的加载")
-
-    def _call(self, prompt: str, stop: Optional[List[str]] = None,
-              run_manager: Optional[CallbackManagerForLLMRun] = None,
-              **kwargs: Any):
-        # 重写调用函数
-        response, history = self.model.chat(self.tokenizer, prompt, history=[], do_sample=False)
-        return response
-
-
-
-    @property
-    def _llm_type(self) -> str:
-        return "ChatGLM3-6B"

+ 32 - 49
answerQuestions.py

@@ -1,54 +1,37 @@
-# coding=gbk
-from langchain.vectorstores import Chroma
-from langchain.embeddings.huggingface import HuggingFaceEmbeddings
+import pandas as pd
+import chromadb
 import os
-
-# 定义 Embeddings
-embeddings = HuggingFaceEmbeddings(model_name="/mnt/sdb/zhaoyuan/rongrunxiang/acge_text_embedding")
-
-
-# 向量数据库持久化路径
-persist_directory = 'data_base/vector_db/chroma'
-
-# 加载数据库
-vectordb = Chroma(
-    persist_directory=persist_directory,
-    embedding_function=embeddings
+import erniebot
+# Configure the ERNIE Bot SDK for the AI Studio backend.
+erniebot.api_type = "aistudio"
+# Read the access token from the environment rather than committing the
+# credential to the repository (EB_ACCESS_TOKEN is the SDK's own variable name).
+erniebot.access_token = os.environ["EB_ACCESS_TOKEN"]
+
+# Open the persistent Chroma store and the collection holding the vectors.
+chroma_client = chromadb.PersistentClient(path="data/chroma")
+vectordb = chroma_client.get_collection(name="collection")
+
+# "What is a cloned sheep?" -- the literal had been mis-decoded (UTF-8 bytes
+# read as GBK); restored to the intended Chinese text.
+question = "什么是克隆羊"
+query = [question]
+
+# Embed the question; the API takes a list of texts and returns one entry
+# per input under response.data.
+response = erniebot.Embedding.create(
+    input=query,
+    model='ernie-text-embedding',
 )
+query_embeddings = response.data[0]['embedding']
+print(query_embeddings)
 
-from LLM import ChatGLM_LLM
-modelPath = "/mnt/sdb/zhaoyuan/rongrunxiang/glm-4-9b-chat"
-llm = ChatGLM_LLM(model_path = modelPath)
-llm.predict("你是谁")
-
-from langchain.prompts import PromptTemplate
-
-# 我们所构造的 Prompt 模板
-template = """根据用户的问题,从提供的上下文中找到相应的知识,并据此创作一个适合于3-6岁小孩的绘本文本。尽量生动有趣。
-{context}
-问题: {question}
-文本:"""
-
-# 调用 LangChain 的方法来实例化一个 Template 对象,该对象包含了 context 和 question 两个变量,在实际调用时,这两个变量会被检索到的文档片段和用户提问填充
-QA_CHAIN_PROMPT = PromptTemplate(input_variables=["context","question"],template=template)
-
-from langchain.chains import RetrievalQA
-
-qa_chain = RetrievalQA.from_chain_type(llm,retriever=vectordb.as_retriever(),return_source_documents=True,chain_type_kwargs={"prompt":QA_CHAIN_PROMPT})
-
-# 检索问答链回答效果
-question = "什么是哺乳动物"
-result = qa_chain({"query": question})
-print("检索问答链回答 question 的结果:")
-print(result["result"])
-
-if "source_documents" in result:
-    print("\n找到的文档片段:")
-    for doc in result["source_documents"]:
-        print(doc)  # 这里假设文档片段是直接可打印的,可能需要根据实际情况调整格式
+# Retrieve the single closest entry (top_k = 1) for the question embedding.
+results = vectordb.query(
+    query_embeddings=query_embeddings,
+    n_results=1,
+)
 
+# Index 0 selects the hits belonging to our only query.
+search_result = {'metadatas': results['metadatas'][0], 'documents': results['documents'][0]}
+# Prompt (Chinese): "You are an assistant who is good at telling stories to
+# children; answer the child's question using your own knowledge and the
+# retrieved content. Because the child is young, answer in the form of a
+# story, and make it vivid, fun and meaningful." The literal had been
+# mis-decoded (UTF-8 bytes read as GBK) and is restored here.
+prompt = f"你是一个擅长给小朋友讲故事的小助手,请根据自己的知识和检索到的内容,对小朋友的疑问进行解答。注意,由于小朋友的年龄较小,问题解答需要用故事的形式。尽量使故事生动、有趣、富有哲理。 \n 用户问题: {question} \n\n 搜索结果:\n {search_result}"
+print(prompt)
 
-# 仅 LLM 回答效果
-result_2 = llm(question)
-print("大模型回答 question 的结果:")
-print(result_2)
+# Send the assembled prompt to ERNIE 4.0 as a single user turn.
+messages = [{"role": "user", "content": prompt}]
+response = erniebot.ChatCompletion.create(model="ernie-4.0", messages=messages)
+
+# Print whatever the SDK reports as the result (presumably the generated story).
+story = response.get_result()
+print(story)

+ 75 - 0
buildVectors.py

@@ -0,0 +1,75 @@
+import pandas as pd
+import chromadb
+import os
+import erniebot
+# Configure the ERNIE Bot SDK for the AI Studio backend.
+erniebot.api_type = "aistudio"
+# Read the access token from the environment rather than committing the
+# credential to the repository (EB_ACCESS_TOKEN is the SDK's own variable name).
+erniebot.access_token = os.environ["EB_ACCESS_TOKEN"]
+
+# Persistent Chroma store on disk; the collection receives the vectors below.
+# NOTE(review): create_collection presumably fails when "collection" already
+# exists, so a re-run needs a fresh data/chroma directory -- confirm.
+chroma_client = chromadb.PersistentClient(path="data/chroma")
+collection = chroma_client.create_collection(name="collection")
+
+from langchain.text_splitter import RecursiveCharacterTextSplitter
+
+def get_files(dir_path):
+    """Return the paths of all ``.csv`` files found under ``dir_path``.
+
+    The tree is walked recursively with ``os.walk``; every file whose name
+    ends with ".csv" is returned as the walked directory joined with the
+    file name, in walk order.
+    """
+    return [
+        os.path.join(root, name)
+        for root, _subdirs, names in os.walk(dir_path)
+        for name in names
+        if name.endswith(".csv")
+    ]
+
+tar_dir = "files/"
+
+# Walk the directory once; both passes below use the same list.
+file_list = get_files(tar_dir)
+
+# Preview pass: print the first rows of every CSV and the book name derived
+# from its file name before any embedding request is made.
+for file in file_list:
+    df = pd.read_csv(file)
+    print(df.head())
+    print(os.path.basename(file).split(".")[0])
+    print("---")
+
+# The splitter keeps no per-row state, so build it once instead of per row.
+text_splitter = RecursiveCharacterTextSplitter(chunk_size=400, chunk_overlap=0)
+# Running counter so ids stay unique across all rows and files.
+my_id = 1
+
+for file in file_list:
+    print(file)
+    df = pd.read_csv(file)
+    # basename also copes with the backslash separators os.walk produces on
+    # Windows, which a plain split("/") would leave inside the name.
+    book_name = os.path.basename(file).split(".")[0]
+
+    for index, row in df.iterrows():
+        # Column 0 is used as the title and column 1 as the body text.
+        title = row.iloc[0]
+        text = row.iloc[1]
+        # Empty CSV cells are read as NaN (a float), which would crash the
+        # splitter and the string formatting below; skip such rows.
+        if pd.isna(title) or pd.isna(text):
+            print(f"skipping row {index} of {file}: missing title or text")
+            continue
+        text = str(text)
+
+        all_splits = text_splitter.split_text(text)
+        all_splits = [f"title: {title} content:{s}" for s in all_splits]
+        if not all_splits:
+            # Nothing to embed; an empty add() would be rejected.
+            continue
+
+        try:
+            response = erniebot.Embedding.create(
+                model='ernie-text-embedding',
+                input=all_splits)
+        except Exception as e:
+            # Best effort: report the failing chunks and move on to the next row.
+            # NOTE(review): the embedding API may cap the number of texts per
+            # request, in which case very long rows always land here -- confirm.
+            print(all_splits)
+            print(e)
+            continue
+
+        # Build the batch for THIS row only. Accumulating across rows and
+        # calling add() on every row re-sent all earlier entries with
+        # duplicate ids.
+        docs = []
+        metadatas = []
+        ids = []
+        embeddings = []
+        for i in range(len(all_splits)):
+            # NOTE(review): the full row text (not the chunk) is stored as the
+            # document, presumably so retrieval returns the whole story -- confirm.
+            docs.append(text)
+            metadatas.append({"book": book_name, "title": title})
+            embeddings.append(response.data[i]['embedding'])
+            ids.append(f"id{my_id}")
+            my_id += 1
+
+        collection.add(documents=docs,
+                       metadatas=metadatas,
+                       ids=ids,
+                       embeddings=embeddings)
+
+print("Number of vectors in vectordb: ", collection.count())

+ 0 - 116
buildVectors2.py

@@ -1,116 +0,0 @@
-# coding=utf-8
-from langchain.document_loaders import UnstructuredFileLoader
-from langchain.document_loaders import UnstructuredMarkdownLoader
-from langchain.text_splitter import RecursiveCharacterTextSplitter
-from langchain.vectorstores import Chroma
-from langchain.embeddings.huggingface import HuggingFaceEmbeddings
-from langchain_community.document_loaders import PyPDFLoader
-from langchain_community.document_loaders import MathpixPDFLoader
-from langchain_community.document_loaders import UnstructuredPDFLoader
-from langchain_community.document_loaders import AzureAIDocumentIntelligenceLoader
-from langchain_community.document_loaders import Docx2txtLoader
-from langchain_community.document_loaders.csv_loader import CSVLoader
-
-from langchain.text_splitter import MarkdownHeaderTextSplitter
-from tqdm import tqdm
-from sentence_transformers import SentenceTransformer, util
-import os
-import chardet
-
-import erniebot
-import numpy as np
-from langchain_embedding_ErnieBotSDK import ErnieEmbeddings
-
-erniebot.api_type = "aistudio"
-erniebot.access_token = "ff1531c8c0f429f92adbc2eaed2e23bfb5349e0f"
-
-embeddings=ErnieEmbeddings(access_token="ff1531c8c0f429f92adbc2eaed2e23bfb5349e0f", chunk_size=1)
-
-# 获取文件路径函数
-def get_files(dir_path):
-    # args:dir_path,目标文件夹路径
-    file_list = []
-    for filepath, dirnames, filenames in os.walk(dir_path):
-        # os.walk 函数将递归遍历指定文件夹
-        for filename in filenames:
-            # 通过后缀名判断文件类型是否满足要求
-            if filename.endswith(".md"):
-                # 如果满足要求,将其绝对路径加入到结果列表
-                file_list.append(os.path.join(filepath, filename))
-            elif filename.endswith(".txt"):
-                file_list.append(os.path.join(filepath, filename))
-            #elif filename.endswith(".pdf"):
-                #file_list.append(os.path.join(filepath, filename))
-            elif filename.endswith(".docx"):
-                file_list.append(os.path.join(filepath, filename))
-            elif filename.endswith(".csv"):
-                file_list.append(os.path.join(filepath, filename))
-
-    return file_list
-
-
-# 加载文件函数
-def get_text(dir_path):
-    file_lst = get_files(dir_path)
-    docs = []
-    for one_file in tqdm(file_lst):
-        file_type = one_file.split('.')[-1]
-
-        # 尝试检测文件编码
-        with open(one_file, 'rb') as f:
-            rawdata = f.read()
-        encoding = chardet.detect(rawdata)['encoding']
-        print(f"Detected encoding for {one_file}: {encoding}")
-
-        # 根据文件类型创建适当的加载器,并指定编码
-        if file_type == 'md':
-            loader = UnstructuredMarkdownLoader(one_file, encoding=encoding)
-        elif file_type == 'txt':
-            loader = UnstructuredFileLoader(one_file, encoding=encoding)
-        #elif file_type == 'pdf':
-            # loader = PyPDFLoader(one_file)
-            # loader = MathpixPDFLoader(one_file)
-            #loader = UnstructuredPDFLoader(one_file)
-        elif file_type == 'docx':
-            loader = Docx2txtLoader(one_file, encoding=encoding)
-        elif file_type == 'csv':
-            loader = CSVLoader(one_file, encoding=encoding)
-        else:
-            continue
-
-        # 加载文档
-        try:
-            docs.extend(loader.load())
-        except UnicodeDecodeError as e:
-            print(f"Failed to load {one_file} due to encoding error: {str(e)}")
-            continue
-
-    return docs
-
-
-# 目标文件夹
-tar_dir = [
-    "files/",
-]
-
-# 加载目标文件
-docs = []
-for dir_path in tar_dir:
-    print(get_text(dir_path))
-    docs.extend(get_text(dir_path))
-
-print(docs)
-
-#embeddings = HuggingFaceEmbeddings(model_name="/mnt/sdb/zhaoyuan/rongrunxiang/acge_text_embedding")
-
-# 构建向量数据库
-# 定义持久化路径
-persist_directory = 'data_base/vector_db/chroma'
-# 加载数据库
-vectordb = Chroma.from_documents(
-    documents=docs,
-    embedding=embeddings,
-    persist_directory=persist_directory  # 允许我们将persist_directory目录保存到磁盘上
-)
-# 将加载的向量数据库持久化到磁盘上
-vectordb.persist()

+ 0 - 128
buildVectors3.py

@@ -1,128 +0,0 @@
-# coding=utf-8
-import chromadb
-import pandas as pd
-from langchain.document_loaders import UnstructuredFileLoader
-from langchain.document_loaders import UnstructuredMarkdownLoader
-from langchain.text_splitter import RecursiveCharacterTextSplitter
-from langchain.vectorstores import Chroma
-from langchain.embeddings.huggingface import HuggingFaceEmbeddings
-from langchain_community.document_loaders import PyPDFLoader
-from langchain_community.document_loaders import MathpixPDFLoader
-from langchain_community.document_loaders import UnstructuredPDFLoader
-from langchain_community.document_loaders import AzureAIDocumentIntelligenceLoader
-from langchain_community.document_loaders import Docx2txtLoader
-from langchain_community.document_loaders.csv_loader import CSVLoader
-
-from langchain.text_splitter import MarkdownHeaderTextSplitter
-from tqdm import tqdm
-from sentence_transformers import SentenceTransformer, util
-import os
-import chardet
-
-import erniebot
-import numpy as np
-from langchain_embedding_ErnieBotSDK import ErnieEmbeddings
-
-erniebot.api_type = "aistudio"
-erniebot.access_token = "ff1531c8c0f429f92adbc2eaed2e23bfb5349e0f"
-
-embeddings=ErnieEmbeddings(access_token="ff1531c8c0f429f92adbc2eaed2e23bfb5349e0f", chunk_size=1)
-response = erniebot.Embedding.create(
-    model='ernie-text-embedding',
-    input=[
-        "我是百度公司开发的人工智能语言模型,我的中文名是文心一言,英文名是ERNIE-Bot。",
-        "2018年深圳市各区GDP"
-    ])
-
-for embedding in response.get_result():
-    embedding = np.array(embedding)
-    print(embedding)
-
-chroma_client = chromadb.PersistentClient(path="data/chroma")
-collection = chroma_client.create_collection(name="collection")
-
-# 获取文件路径函数
-def get_files(dir_path):
-    # args:dir_path,目标文件夹路径
-    file_list = []
-    for filepath, dirnames, filenames in os.walk(dir_path):
-        # os.walk 函数将递归遍历指定文件夹
-        for filename in filenames:
-            # 通过后缀名判断文件类型是否满足要求
-            if filename.endswith(".csv"):
-                file_list.append(os.path.join(filepath, filename))
-
-    return file_list
-
-
-# 加载文件函数
-def get_text(dir_path):
-    file_lst = get_files(dir_path)
-    docs = []
-    metadatas = []
-    ids = []
-    embeddings = []
-
-    for one_file in tqdm(file_lst):
-        file_type = one_file.split('.')[-1]
-
-        # 尝试检测文件编码
-        with open(one_file, 'rb') as f:
-            rawdata = f.read()
-        encoding = chardet.detect(rawdata)['encoding']
-        print(f"Detected encoding for {one_file}: {encoding}")
-
-        if file_type == 'csv':
-            df = pd.read_csv(one_file)
-            for index, row in df.iterrows():
-                output_str = ""
-                text = row[1]
-                text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=150)
-                all_splits = text_splitter.split_text(text)
-                for split_text in all_splits:
-                    split_text = row[0] + " " + split_text
-                my_list = []
-                my_list.append(output_str)
-
-                response = erniebot.Embedding.create(
-                    model='ernie-text-embedding',
-                    input=my_list)
-                embeddings.append(response.data[0].embedding)
-                docs.append(output_str)
-                metadatas.append({"source": one_file.split(".")[0]})
-                ids.append(f"id{index}")
-
-
-        else:
-            continue
-
-
-    return docs
-
-
-# 目标文件夹
-tar_dir = [
-    "files/",
-]
-
-# 加载目标文件
-docs = []
-for dir_path in tar_dir:
-    print(get_text(dir_path))
-    docs.extend(get_text(dir_path))
-
-print(docs)
-
-#embeddings = HuggingFaceEmbeddings(model_name="/mnt/sdb/zhaoyuan/rongrunxiang/acge_text_embedding")
-
-# 构建向量数据库
-# 定义持久化路径
-persist_directory = 'data_base/vector_db/chroma'
-# 加载数据库
-vectordb = Chroma.from_documents(
-    documents=docs,
-    embedding=embeddings,
-    persist_directory=persist_directory  # 允许我们将persist_directory目录保存到磁盘上
-)
-# 将加载的向量数据库持久化到磁盘上
-vectordb.persist()

+ 0 - 179
langchain_embedding_ErnieBotSDK.py

@@ -1,179 +0,0 @@
-import asyncio
-import logging
-import threading
-from functools import partial
-from typing import Dict, List, Optional
-
-import requests
-
-from langchain.pydantic_v1 import BaseModel, root_validator
-from langchain.schema.embeddings import Embeddings
-from langchain.utils import get_from_dict_or_env
-import erniebot
-import numpy as np
-import time
-import os
-## 注意不要用翻墙
-## https://python.langchain.com/docs/integrations/chat/ernie
-
-logger = logging.getLogger(__name__)
-
-
-class ErnieEmbeddings(BaseModel, Embeddings):
-    """`Ernie Embeddings V1` embedding models."""
-
-    ernie_api_base: Optional[str] = None
-    ernie_client_id: Optional[str] = None
-    ernie_client_secret: Optional[str] = None
-    access_token: Optional[str] = None#erniebot.access_token = '<access-token-for-aistudio>'
-    
-    chunk_size: int = 16
-
-    model_name = "ErnieBot-Embedding-V1"
-
-    _lock = threading.Lock()
-    '''
-    kevin modify:
-    '''
-    @root_validator()
-    def validate_environment(cls, values: Dict) -> Dict:
-        # values["ernie_api_base"] = get_from_dict_or_env(
-        #     values, "ernie_api_base", "ERNIE_API_BASE", "https://aip.baidubce.com"
-        # )
-        values["access_token"] = get_from_dict_or_env(
-            values,
-            "access_token",
-            "ACCESS_TOKEN",
-        )
-        values["api_type"] = 'aistudio'
-        
-        erniebot.api_type = values["api_type"]
-        erniebot.access_token = values["access_token"]
-        return values
-
-    # def _embedding(self, json: object) -> dict:
-        # base_url = (
-        #     f"{self.ernie_api_base}/rpc/2.0/ai_custom/v1/wenxinworkshop/embeddings"
-        # )
-        # resp = requests.post(
-        #     f"{base_url}/embedding-v1",
-        #     headers={
-        #         "Content-Type": "application/json",
-        #     },
-        #     params={"access_token": self.access_token},
-        #     json=json,
-        # )
-        # return resp.json()
-    '''
-    kevin modify:
-    '''
-    def _embedding(self, json: object) -> dict:
-        inputs=json['input']
-        def erniebotSDK(inputs):
-            response = erniebot.Embedding.create(
-                model='ernie-text-embedding',
-                input=inputs)
-            time.sleep(1)
-            return response
-        try:
-            response=erniebotSDK(inputs)
-        except:
-            print('connect erniebot error...wait 2s to retry(kevin)')
-            time.sleep(2)
-            response=erniebotSDK(inputs)
-        return response
-    
-    def _refresh_access_token_with_lock(self) -> None:
-        with self._lock:
-            logger.debug("Refreshing access token")
-            base_url: str = f"{self.ernie_api_base}/oauth/2.0/token"
-            resp = requests.post(
-                base_url,
-                headers={
-                    "Content-Type": "application/json",
-                    "Accept": "application/json",
-                },
-                params={
-                    "grant_type": "client_credentials",
-                    "client_id": self.ernie_client_id,
-                    "client_secret": self.ernie_client_secret,
-                },
-            )
-            self.access_token = str(resp.json().get("access_token"))
-
-    def embed_documents(self, texts: List[str]) -> List[List[float]]:
-        """Embed search docs.
-
-        Args:
-            texts: The list of texts to embed
-
-        Returns:
-            List[List[float]]: List of embeddings, one for each text.
-        """
-
-        if not self.access_token:
-            self._refresh_access_token_with_lock()
-        text_in_chunks = [
-            texts[i : i + self.chunk_size]
-            for i in range(0, len(texts), self.chunk_size)
-        ]
-        lst = []
-        for chunk in text_in_chunks:
-            resp = self._embedding({"input": [text for text in chunk]})
-            if resp.get("error_code"):
-                if resp.get("error_code") == 111:
-                    self._refresh_access_token_with_lock()
-                    resp = self._embedding({"input": [text for text in chunk]})
-                else:
-                    raise ValueError(f"Error from Ernie: {resp}")
-            lst.extend([i["embedding"] for i in resp["data"]])
-        return lst
-
-    def embed_query(self, text: str) -> List[float]:
-        """Embed query text.
-
-        Args:
-            text: The text to embed.
-
-        Returns:
-            List[float]: Embeddings for the text.
-        """
-
-        if not self.access_token:
-            self._refresh_access_token_with_lock()
-        resp = self._embedding({"input": [text]})
-        if resp.get("error_code"):
-            if resp.get("error_code") == 111:
-                self._refresh_access_token_with_lock()
-                resp = self._embedding({"input": [text]})
-            else:
-                raise ValueError(f"Error from Ernie: {resp}")
-        return resp["data"][0]["embedding"]
-
-    async def aembed_query(self, text: str) -> List[float]:
-        """Asynchronous Embed query text.
-
-        Args:
-            text: The text to embed.
-
-        Returns:
-            List[float]: Embeddings for the text.
-        """
-
-        return await asyncio.get_running_loop().run_in_executor(
-            None, partial(self.embed_query, text)
-        )
-
-    async def aembed_documents(self, texts: List[str]) -> List[List[float]]:
-        """Asynchronous Embed search docs.
-
-        Args:
-            texts: The list of texts to embed
-
-        Returns:
-            List[List[float]]: List of embeddings, one for each text.
-        """
-
-        result = await asyncio.gather(*[self.aembed_query(text) for text in texts])
-
-        return list(result)

File diff suppressed because it is too large
+ 0 - 27
test.py


Some files were not shown because too many files changed in this diff