Fleet AI Context

Fleet AI Context is a dataset of high-quality embeddings of the top 1200 most popular and permissively licensed Python libraries and their documentation.

The Fleet AI team is on a mission to embed the world's most important data. They've started by embedding the top 1200 Python libraries to enable code generation with up-to-date knowledge. They've been kind enough to share their embeddings of the LangChain docs and API reference.

Let's take a look at how we can use these embeddings to power a documentation retrieval system, and ultimately a simple code-generating chain!

%pip install --upgrade --quiet  langchain fleet-context langchain-openai pandas faiss-cpu # faiss-gpu for CUDA supported GPU
from operator import itemgetter
from typing import Any, Optional, Type

import pandas as pd
from langchain.retrievers import MultiVectorRetriever
from langchain_community.vectorstores import FAISS
from langchain_core.documents import Document
from langchain_core.stores import BaseStore
from langchain_core.vectorstores import VectorStore
from langchain_openai import OpenAIEmbeddings


def load_fleet_retriever(
    df: pd.DataFrame,
    *,
    vectorstore_cls: Type[VectorStore] = FAISS,
    docstore: Optional[BaseStore] = None,
    **kwargs: Any,
):
    """Build a retriever over a Fleet Context embeddings DataFrame.

    Without a docstore, returns a plain retriever over the embedded chunks.
    With a docstore, returns a MultiVectorRetriever that maps chunks back to
    their parent documents.
    """
    vectorstore = _populate_vectorstore(df, vectorstore_cls)
    if docstore is None:
        return vectorstore.as_retriever(**kwargs)
    else:
        _populate_docstore(df, docstore)
        return MultiVectorRetriever(
            vectorstore=vectorstore, docstore=docstore, id_key="parent", **kwargs
        )


def _populate_vectorstore(
    df: pd.DataFrame,
    vectorstore_cls: Type[VectorStore],
) -> VectorStore:
    """Load the precomputed (text, embedding) pairs into a vector store."""
    if not hasattr(vectorstore_cls, "from_embeddings"):
        raise ValueError(
            f"Incompatible vector store class {vectorstore_cls}. "
            "Must implement `from_embeddings` class method."
        )
    texts_embeddings = []
    metadatas = []
    for _, row in df.iterrows():
        texts_embeddings.append((row.metadata["text"], row["dense_embeddings"]))
        metadatas.append(row.metadata)
    return vectorstore_cls.from_embeddings(
        texts_embeddings,
        OpenAIEmbeddings(model="text-embedding-ada-002"),
        metadatas=metadatas,
    )


def _populate_docstore(df: pd.DataFrame, docstore: BaseStore) -> None:
    """Reassemble full parent documents from their chunks and store them by id."""
    parent_docs = []
    df = df.copy()
    df["parent"] = df.metadata.apply(itemgetter("parent"))
    for parent_id, group in df.groupby("parent"):
        # Order the chunks by their position within the original page.
        sorted_group = group.iloc[
            group.metadata.apply(itemgetter("section_index")).argsort()
        ]
        text = "".join(sorted_group.metadata.apply(itemgetter("text")))
        metadata = {
            k: sorted_group.iloc[0].metadata[k] for k in ("title", "type", "url")
        }
        text = metadata["title"] + "\n" + text
        metadata["id"] = parent_id
        parent_docs.append(Document(page_content=text, metadata=metadata))
    docstore.mset([(d.metadata["id"], d) for d in parent_docs])

Retriever chunks

As part of their embedding process, the Fleet AI team first chunked long documents before embedding them. This means the vectors correspond to sections of pages in the LangChain docs, not entire pages. By default, when we spin up a retriever from these embeddings, we'll be retrieving these embedded chunks.

We will be using Fleet Context's download_embeddings() to grab LangChain's documentation embeddings. You can view all supported libraries' documentation at https://fleet.so/context.

from context import download_embeddings

df = download_embeddings("langchain")
vecstore_retriever = load_fleet_retriever(df)
vecstore_retriever.invoke("How does the multi vector retriever work")
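
To get a feel for what comes back, we can inspect the metadata on the retrieved chunks. A minimal sketch reusing the vecstore_retriever from above; the metadata keys shown ("parent", "section_index", "title", "url") are the ones the helper functions above rely on:

# Peek at the chunk-level results. Each Document's metadata carries the
# fields used by the helpers above ("parent", "section_index", "title", "url").
docs = vecstore_retriever.invoke("How does the multi vector retriever work")
for d in docs:
    print(d.metadata["title"], "->", d.metadata["url"])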

Other packages

You can download and use other embeddings from this Dropbox link.
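
If you've downloaded one of those parquet files, you can load it with pandas and hand it to the same helper. A minimal sketch; the filename here is a placeholder for whichever file you actually downloaded:

# Load a downloaded embeddings parquet and build a chunk-level retriever from it.
# "other_library.parquet" is a hypothetical filename.
other_df = pd.read_parquet("other_library.parquet")
other_retriever = load_fleet_retriever(other_df)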

Retrieve parent docs

The embeddings provided by Fleet AI contain metadata that indicates which embedding chunks correspond to the same original document page. If we'd like, we can use this information to retrieve whole parent documents, and not just embedded chunks. Under the hood, we'll use a MultiVectorRetriever and a BaseStore object to search for relevant chunks and then map them to their parent document.

from langchain.storage import InMemoryStore

# load_fleet_retriever expects a DataFrame, so read the parquet from the
# Dropbox URL into pandas first.
parent_retriever = load_fleet_retriever(
    pd.read_parquet(
        "https://www.dropbox.com/scl/fi/4rescpkrg9970s3huz47l/libraries_langchain_release.parquet?rlkey=283knw4wamezfwiidgpgptkep&dl=1"
    ),
    docstore=InMemoryStore(),
)
API Reference: InMemoryStore
parent_retriever.invoke("How does the multi vector retriever work")
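
To see roughly what MultiVectorRetriever is doing for us here, below is a sketch of the lookup logic. It reuses the retriever's own vectorstore and docstore attributes and mirrors, but is not, the library's internal implementation:

# Sketch of the parent-document lookup MultiVectorRetriever performs with
# id_key="parent": search for chunks, then map them to their full pages.
sub_docs = parent_retriever.vectorstore.similarity_search(
    "How does the multi vector retriever work"
)
parent_ids = []
for d in sub_docs:
    if d.metadata["parent"] not in parent_ids:  # dedupe, preserving order
        parent_ids.append(d.metadata["parent"])
full_pages = parent_retriever.docstore.mget(parent_ids)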

Putting it in a chain

Let's try using our retrieval systems in a simple chain!

from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.runnables import RunnablePassthrough
from langchain_openai import ChatOpenAI

prompt = ChatPromptTemplate.from_messages(
    [
        (
            "system",
            """You are a great software engineer who is very familiar \
with Python. Given a user question or request about a new Python library called LangChain and \
parts of the LangChain documentation, answer the question or generate the requested code. \
Your answers must be accurate, should include code whenever possible, and should not assume anything \
about LangChain which is not explicitly stated in the LangChain documentation. If the required \
information is not available, just say so.

LangChain Documentation
------------------

{context}""",
        ),
        ("human", "{question}"),
    ]
)

model = ChatOpenAI(model="gpt-3.5-turbo-16k")

chain = (
    {
        "question": RunnablePassthrough(),
        "context": parent_retriever
        | (lambda docs: "\n\n".join(d.page_content for d in docs)),
    }
    | prompt
    | model
    | StrOutputParser()
)
# Stream the answer token by token (invoke would return the whole string at once).
for chunk in chain.stream(
    "How do I create a FAISS vector store retriever that returns 10 documents per search query"
):
    print(chunk, end="", flush=True)
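
The chunk-level vecstore_retriever from earlier plugs into the same pipeline; only the "context" step changes. A minimal sketch, reusing the prompt and model defined above:

# Same chain, but grounding on embedded chunks instead of whole parent pages.
chunk_chain = (
    {
        "question": RunnablePassthrough(),
        "context": vecstore_retriever
        | (lambda docs: "\n\n".join(d.page_content for d in docs)),
    }
    | prompt
    | model
    | StrOutputParser()
)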