123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128 |
- # coding=utf-8
- import chromadb
- import pandas as pd
- from langchain.document_loaders import UnstructuredFileLoader
- from langchain.document_loaders import UnstructuredMarkdownLoader
- from langchain.text_splitter import RecursiveCharacterTextSplitter
- from langchain.vectorstores import Chroma
- from langchain.embeddings.huggingface import HuggingFaceEmbeddings
- from langchain_community.document_loaders import PyPDFLoader
- from langchain_community.document_loaders import MathpixPDFLoader
- from langchain_community.document_loaders import UnstructuredPDFLoader
- from langchain_community.document_loaders import AzureAIDocumentIntelligenceLoader
- from langchain_community.document_loaders import Docx2txtLoader
- from langchain_community.document_loaders.csv_loader import CSVLoader
- from langchain.text_splitter import MarkdownHeaderTextSplitter
- from tqdm import tqdm
- from sentence_transformers import SentenceTransformer, util
- import os
- import chardet
- import erniebot
- import numpy as np
- from langchain_embedding_ErnieBotSDK import ErnieEmbeddings
# --- ErnieBot / ChromaDB setup ------------------------------------------
# SECURITY FIX: the AI Studio access token was hard-coded here, committing a
# live credential to source control. Read it from the environment instead
# (export ERNIE_ACCESS_TOKEN=... before running).
ERNIE_ACCESS_TOKEN = os.environ.get("ERNIE_ACCESS_TOKEN", "")

erniebot.api_type = "aistudio"
erniebot.access_token = ERNIE_ACCESS_TOKEN

# Embedding wrapper used later by the Chroma vector store.
embeddings = ErnieEmbeddings(access_token=ERNIE_ACCESS_TOKEN, chunk_size=1)

# Smoke test: embed two sample sentences and print the raw vectors so a
# misconfigured token fails fast, before any files are processed.
response = erniebot.Embedding.create(
    model='ernie-text-embedding',
    input=[
        "我是百度公司开发的人工智能语言模型,我的中文名是文心一言,英文名是ERNIE-Bot。",
        "2018年深圳市各区GDP",
    ])
for embedding in response.get_result():
    print(np.array(embedding))

chroma_client = chromadb.PersistentClient(path="data/chroma")
# ROBUSTNESS: create_collection raises if the collection already exists on a
# re-run against the same persistent path; get_or_create_collection does not.
collection = chroma_client.get_or_create_collection(name="collection")
# File-path collection helper
def get_files(dir_path, suffixes=(".csv",)):
    """Recursively collect file paths under ``dir_path``.

    Args:
        dir_path: Root directory to walk.
        suffixes: File-name suffixes to keep (generalized from the original
            hard-coded ".csv"; the default preserves old behavior).

    Returns:
        List of full paths of matching files, in os.walk order.
    """
    matched = []
    want = tuple(suffixes)  # str.endswith accepts a tuple of options
    for dirpath, _dirnames, filenames in os.walk(dir_path):
        for filename in filenames:
            if filename.endswith(want):
                matched.append(os.path.join(dirpath, filename))
    return matched
# File loading / embedding helper
def get_text(dir_path):
    """Load every CSV under ``dir_path``, split each row's text into chunks,
    embed each chunk with ErnieBot, and return the chunk strings.

    Assumes each CSV row has a title-like field in column 0 and the main text
    in column 1 — TODO confirm against the actual files.

    Returns:
        List of chunk strings ("<col0> <chunk>"). The parallel embeddings,
        metadatas and ids lists are built but, as in the original, not
        returned.
    """
    file_lst = get_files(dir_path)
    docs = []
    metadatas = []
    ids = []
    # Renamed from `embeddings`: the original shadowed the module-level
    # ErnieEmbeddings object used later by Chroma.
    chunk_embeddings = []
    # Hoisted out of the row loop — the splitter is configuration, not state.
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=150)
    chunk_no = 0
    for one_file in tqdm(file_lst):
        file_type = one_file.split('.')[-1]
        # Detect the file encoding so non-UTF-8 CSVs load correctly.
        with open(one_file, 'rb') as f:
            rawdata = f.read()
        encoding = chardet.detect(rawdata)['encoding']
        print(f"Detected encoding for {one_file}: {encoding}")
        if file_type != 'csv':
            continue
        # BUG FIX: the detected encoding was never used; pass it to pandas.
        df = pd.read_csv(one_file, encoding=encoding)
        for _index, row in df.iterrows():
            text = row[1]
            for split_text in text_splitter.split_text(text):
                # Prefix each chunk with the row's first column for context.
                chunk = row[0] + " " + split_text
                response = erniebot.Embedding.create(
                    model='ernie-text-embedding',
                    input=[chunk])
                # BUG FIX: the original embedded and stored `output_str`,
                # which was always the empty string — the chunk text itself
                # never reached the embedding call or the docs list.
                chunk_embeddings.append(response.data[0].embedding)
                docs.append(chunk)
                metadatas.append({"source": one_file.split(".")[0]})
                # BUG FIX: ids were f"id{index}" (row index only), colliding
                # across files and across multiple splits of the same row.
                ids.append(f"id{chunk_no}")
                chunk_no += 1
    return docs
# Target folders to ingest
tar_dir = [
    "files/",
]

# Load the target files.
docs = []
for dir_path in tar_dir:
    # BUG FIX: get_text() was called twice per directory (once to print, once
    # to extend), doubling every embedding API call. Call it once and reuse.
    dir_docs = get_text(dir_path)
    print(dir_docs)
    docs.extend(dir_docs)
print(docs)

# Build the vector database.
# Persistence directory for Chroma.
persist_directory = 'data_base/vector_db/chroma'

# BUG FIX: `docs` is a list of plain strings, but Chroma.from_documents
# expects langchain Document objects; from_texts is the correct constructor
# for raw strings.
vectordb = Chroma.from_texts(
    texts=docs,
    embedding=embeddings,
    persist_directory=persist_directory,  # save the index to disk
)
# Flush the loaded vectors to the persistence directory.
vectordb.persist()
|