# coding=utf-8
"""Build a persistent Chroma vector store from local documents.

Recursively walks the target folders for .md / .txt / .docx / .csv files,
loads them with per-file encoding detection, embeds them with ErnieBot
embeddings, and persists the resulting Chroma database to disk.
"""
from langchain.document_loaders import UnstructuredFileLoader
from langchain.document_loaders import UnstructuredMarkdownLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Chroma
from langchain.embeddings.huggingface import HuggingFaceEmbeddings
from langchain_community.document_loaders import PyPDFLoader
from langchain_community.document_loaders import MathpixPDFLoader
from langchain_community.document_loaders import UnstructuredPDFLoader
from langchain_community.document_loaders import AzureAIDocumentIntelligenceLoader
from langchain_community.document_loaders import Docx2txtLoader
from langchain_community.document_loaders.csv_loader import CSVLoader
from langchain.text_splitter import MarkdownHeaderTextSplitter
from tqdm import tqdm
from sentence_transformers import SentenceTransformer, util
import os
import chardet
import erniebot
import numpy as np
from langchain_embedding_ErnieBotSDK import ErnieEmbeddings

# SECURITY NOTE(review): the access token was hard-coded in source. It can now
# be supplied via the ERNIE_ACCESS_TOKEN environment variable; the original
# literal remains only as a backward-compatible fallback and should be rotated
# and removed from the repository.
_ACCESS_TOKEN = os.getenv(
    "ERNIE_ACCESS_TOKEN", "ff1531c8c0f429f92adbc2eaed2e23bfb5349e0f"
)
erniebot.api_type = "aistudio"
erniebot.access_token = _ACCESS_TOKEN
embeddings = ErnieEmbeddings(access_token=_ACCESS_TOKEN, chunk_size=1)

# File extensions accepted by the ingestion pipeline.
# PDF support is intentionally disabled (see the commented loaders below).
_SUPPORTED_SUFFIXES = (".md", ".txt", ".docx", ".csv")


def get_files(dir_path):
    """Recursively collect paths of supported documents under *dir_path*.

    Args:
        dir_path: root folder to walk.

    Returns:
        List of absolute/relative file paths whose extension is one of
        ``_SUPPORTED_SUFFIXES``.
    """
    file_list = []
    for filepath, dirnames, filenames in os.walk(dir_path):
        for filename in filenames:
            # str.endswith accepts a tuple — one call replaces the old
            # if/elif chain over individual extensions.
            if filename.endswith(_SUPPORTED_SUFFIXES):
                file_list.append(os.path.join(filepath, filename))
    return file_list


def get_text(dir_path):
    """Load every supported file under *dir_path* into LangChain documents.

    Text-based formats are decoded with a chardet-detected encoding; files
    that still fail to decode are skipped with a diagnostic message rather
    than aborting the whole run.

    Args:
        dir_path: root folder to ingest.

    Returns:
        List of loaded Document objects.
    """
    file_lst = get_files(dir_path)
    docs = []
    for one_file in tqdm(file_lst):
        file_type = one_file.split('.')[-1]
        # Detect the file encoding so text loaders can decode correctly.
        with open(one_file, 'rb') as f:
            rawdata = f.read()
        encoding = chardet.detect(rawdata)['encoding']
        print(f"Detected encoding for {one_file}: {encoding}")
        # Pick a loader based on the file extension.
        if file_type == 'md':
            loader = UnstructuredMarkdownLoader(one_file, encoding=encoding)
        elif file_type == 'txt':
            loader = UnstructuredFileLoader(one_file, encoding=encoding)
        # elif file_type == 'pdf':
        #     loader = PyPDFLoader(one_file)
        #     loader = MathpixPDFLoader(one_file)
        #     loader = UnstructuredPDFLoader(one_file)
        elif file_type == 'docx':
            # BUG FIX: .docx is a binary zip container, and
            # Docx2txtLoader.__init__ accepts only the file path —
            # passing encoding= raised TypeError for every .docx file.
            loader = Docx2txtLoader(one_file)
        elif file_type == 'csv':
            loader = CSVLoader(one_file, encoding=encoding)
        else:
            continue
        # Load the document; skip files that still fail to decode.
        try:
            docs.extend(loader.load())
        except UnicodeDecodeError as e:
            print(f"Failed to load {one_file} due to encoding error: {str(e)}")
            continue
    return docs


# Target folders to ingest.
tar_dir = [
    "files/",
]

# Load each directory ONCE. The original called get_text(dir_path) twice
# per directory (once to print, once to extend), parsing the whole corpus
# twice for no benefit.
docs = []
for dir_path in tar_dir:
    loaded = get_text(dir_path)
    print(loaded)
    docs.extend(loaded)
print(docs)

#embeddings = HuggingFaceEmbeddings(model_name="/mnt/sdb/zhaoyuan/rongrunxiang/acge_text_embedding")

# Build the vector database and persist it to disk.
persist_directory = 'data_base/vector_db/chroma'
vectordb = Chroma.from_documents(
    documents=docs,
    embedding=embeddings,
    persist_directory=persist_directory  # save the index under persist_directory
)
# Flush the loaded vectors to disk.
vectordb.persist()