Methods

Embeddings

Embeddings is the engine that delivers semantic search. Data is transformed into embeddings vectors where similar concepts will produce similar vectors. Indexes both large and small are built with these vectors. The indexes are used to find results that have the same meaning, not necessarily the same keywords.
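
As a quick orientation, a minimal usage sketch (the model path and sample data are illustrative, not prescriptive):

from txtai.embeddings import Embeddings

# Create an index backed by a sentence-transformers vector model
embeddings = Embeddings({"path": "sentence-transformers/all-MiniLM-L6-v2"})

# Index a list of (id, data, tags) tuples
embeddings.index([(0, "US tops 5 million confirmed virus cases", None),
                  (1, "Maine man wins $1M from $25 lottery ticket", None)])

# Matches on meaning: returns the virus story for a query that shares no keywords with it
print(embeddings.search("health pandemic", 1))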

Source code in txtai/embeddings/base.py

class Embeddings:
    """
    Embeddings is the engine that delivers semantic search. Data is transformed into embeddings vectors where similar concepts
    will produce similar vectors. Indexes both large and small are built with these vectors. The indexes are used to find results
    that have the same meaning, not necessarily the same keywords.
    """

    # pylint: disable = W0231
    def __init__(self, config=None):
        """
        Creates a new embeddings index. Embeddings indexes are thread-safe for read operations but writes must be
        synchronized.

        Args:
            config: embeddings configuration
        """

        # Index configuration
        self.config = None

        # Dimensionality reduction and scoring index - word vectors only
        self.reducer, self.scoring = None, None

        # Embeddings vector model - transforms data into similarity vectors
        self.model = None

        # Approximate nearest neighbor index
        self.ann = None

        # Document database
        self.database = None

        # Resolvable functions
        self.functions = None

        # Graph network
        self.graph = None

        # Query model
        self.query = None

        # Index archive
        self.archive = None

        # Set initial configuration
        self.configure(config)

    def score(self, documents):
        """
        Builds a scoring index. Only used by word vectors models.

        Args:
            documents: list of (id, data, tags)
        """

        # Build scoring index over documents
        if self.scoring:
            self.scoring.index(documents)

    def index(self, documents, reindex=False):
        """
        Builds an embeddings index. This method overwrites an existing index.

        Args:
            documents: list of (id, data, tags)
            reindex: if this is a reindex operation in which case database creation is skipped, defaults to False
        """

        # Set configuration to default configuration, if empty
        if not self.config:
            self.configure(self.defaults())

        # Create document database, if necessary
        if not reindex:
            self.database = self.createdatabase()

            # Reset archive since this is a new index
            self.archive = None

        # Create graph, if necessary
        self.graph = self.creategraph()

        # Create transform action
        transform = Transform(self, Action.REINDEX if reindex else Action.INDEX)

        with tempfile.NamedTemporaryFile(mode="wb", suffix=".npy") as buffer:
            # Load documents into database and transform to vectors
            ids, dimensions, embeddings = transform(documents, buffer)

            if ids:
                # Build LSA model (if enabled). Remove principal components from embeddings.
                if self.config.get("pca"):
                    self.reducer = Reducer(embeddings, self.config["pca"])
                    self.reducer(embeddings)

                # Normalize embeddings
                self.normalize(embeddings)

                # Save index dimensions
                self.config["dimensions"] = dimensions

                # Create approximate nearest neighbor index
                self.ann = ANNFactory.create(self.config)

                # Add embeddings to the index
                self.ann.index(embeddings)

                # Save indexids-ids mapping for indexes with no database, except when this is a reindex action
                if not reindex and not self.database:
                    self.config["ids"] = ids

        # Index graph, if necessary
        if self.graph:
            self.graph.index(Search(self, True), self.batchsimilarity)

    def upsert(self, documents):
        """
        Runs an embeddings upsert operation. If the index exists, new data is
        appended to the index, existing data is updated. If the index doesn't exist,
        this method runs a standard index operation.

        Args:
            documents: list of (id, data, tags)
        """

        # Run standard insert if index doesn't exist or it has no records
        if not self.count():
            self.index(documents)
            return

        # Create transform action
        transform = Transform(self, Action.UPSERT)

        with tempfile.NamedTemporaryFile(mode="wb", suffix=".npy") as buffer:
            # Load documents into database and transform to vectors
            ids, _, embeddings = transform(documents, buffer)

            if ids:
                # Remove principal components from embeddings, if necessary
                if self.reducer:
                    self.reducer(embeddings)

                # Normalize embeddings
                self.normalize(embeddings)

                # Append embeddings to the index
                self.ann.append(embeddings)

                # Save indexids-ids mapping for indexes with no database
                if not self.database:
                    self.config["ids"] = self.config["ids"] + ids

        # Graph upsert, if necessary
        if self.graph:
            self.graph.upsert(Search(self, True), self.batchsimilarity)

    def delete(self, ids):
        """
        Deletes from an embeddings index. Returns list of ids deleted.

        Args:
            ids: list of ids to delete

        Returns:
            list of ids deleted
        """

        # List of internal indices for each candidate id to delete
        indices = []

        # List of deleted ids
        deletes = []

        if self.database:
            # Retrieve indexid-id mappings from database
            ids = self.database.ids(ids)

            # Parse out indices and ids to delete
            indices = [i for i, _ in ids]
            deletes = sorted(set(uid for _, uid in ids))

            # Delete ids from database
            self.database.delete(deletes)
        elif self.ann:
            # Lookup indexids from config for indexes with no database
            indexids = self.config["ids"]

            # Find existing ids
            for uid in ids:
                indices.extend([index for index, value in enumerate(indexids) if uid == value])

            # Clear config ids
            for index in indices:
                deletes.append(indexids[index])
                indexids[index] = None

        # Delete indices from ann embeddings
        if indices:
            # Delete ids from index
            self.ann.delete(indices)

            # Delete ids from graph
            if self.graph:
                self.graph.delete(indices)

        return deletes

    def reindex(self, config, columns=None, function=None):
        """
        Recreates the approximate nearest neighbor (ann) index using config. This method only works if document
        content storage is enabled.

        Args:
            config: new config
            columns: optional list of document columns used to rebuild data
            function: optional function to prepare content for indexing
        """

        if self.database:
            # Keep content and objects parameters to ensure database is preserved
            config["content"] = self.config["content"]
            if "objects" in self.config:
                config["objects"] = self.config["objects"]

            # Reset configuration
            self.configure(config)

            # Reset function references
            if self.functions:
                self.functions.reset()

            # Reindex
            if function:
                self.index(function(self.database.reindex(columns)), True)
            else:
                self.index(self.database.reindex(columns), True)

    def transform(self, document):
        """
        Transforms document into an embeddings vector.

        Args:
            document: (id, data, tags)

        Returns:
            embeddings vector
        """

        return self.batchtransform([document])[0]

    def batchtransform(self, documents, category=None):
        """
        Transforms documents into embeddings vectors.

        Args:
            documents: list of (id, data, tags)
            category: category for instruction-based embeddings

        Returns:
            embeddings vectors
        """

        # Convert documents into sentence embeddings
        embeddings = self.model.batchtransform(documents, category)

        # Reduce the dimensionality of the embeddings. Scale the embeddings using this
        # model to reduce the noise of common but less relevant terms.
        if self.reducer:
            self.reducer(embeddings)

        # Normalize embeddings
        self.normalize(embeddings)

        return embeddings

    def count(self):
        """
        Total number of elements in this embeddings index.

        Returns:
            number of elements in this embeddings index
        """

        return self.ann.count() if self.ann else 0

    def search(self, query, limit=None):
        """
        Finds documents most similar to the input queries. This method will run either an approximate
        nearest neighbor (ann) search or an approximate nearest neighbor + database search depending
        on if a database is available.

        Args:
            query: input query
            limit: maximum results

        Returns:
            list of (id, score) for ann search, list of dict for an ann+database search
        """

        results = self.batchsearch([query], limit)
        return results[0] if results else results

    def batchsearch(self, queries, limit=None):
        """
        Finds documents most similar to the input queries. This method will run either an approximate
        nearest neighbor (ann) search or an approximate nearest neighbor + database search depending
        on if a database is available.

        Args:
            queries: input queries
            limit: maximum results

        Returns:
            list of (id, score) per query for ann search, list of dict per query for an ann+database search
        """

        return Search(self)(queries, limit if limit else 3)

    def similarity(self, query, data):
        """
        Computes the similarity between query and list of data. Returns a list of
        (id, score) sorted by highest score, where id is the index in data.

        Args:
            query: input query
            data: list of data

        Returns:
            list of (id, score)
        """

        return self.batchsimilarity([query], data)[0]

    def batchsimilarity(self, queries, data):
        """
        Computes the similarity between list of queries and list of data. Returns a list
        of (id, score) sorted by highest score per query, where id is the index in data.

        Args:
            queries: input queries
            data: list of data

        Returns:
            list of (id, score) per query
        """

        # Convert queries to embedding vectors
        queries = self.batchtransform(((None, query, None) for query in queries), "query")
        data = self.batchtransform(((None, row, None) for row in data), "data")

        # Dot product on normalized vectors is equal to cosine similarity
        scores = np.dot(queries, data.T).tolist()

        # Add index and sort desc based on score
        return [sorted(enumerate(score), key=lambda x: x[1], reverse=True) for score in scores]

    def explain(self, query, texts=None, limit=None):
        """
        Explains the importance of each input token in text for a query.

        Args:
            query: input query
            texts: optional list of (text|list of tokens), otherwise runs search query
            limit: optional limit if texts is None

        Returns:
            list of dict per input text where a higher token scores represents higher importance relative to the query
        """

        results = self.batchexplain([query], texts, limit)
        return results[0] if results else results

    def batchexplain(self, queries, texts=None, limit=None):
        """
        Explains the importance of each input token in text for a list of queries.

        Args:
            queries: input queries
            texts: optional list of (text|list of tokens), otherwise runs search queries
            limit: optional limit if texts is None

        Returns:
            list of dict per input text per query where a higher token scores represents higher importance relative to the query
        """

        return Explain(self)(queries, texts, limit)

    def terms(self, query):
        """
        Extracts keyword terms from a query.

        Args:
            query: input query

        Returns:
            query reduced down to keyword terms
        """

        return self.batchterms([query])[0]

    def batchterms(self, queries):
        """
        Extracts keyword terms from a list of queries.

        Args:
            queries: list of queries

        Returns:
            list of queries reduced down to keyword term strings
        """

        return Terms(self)(queries)

    def exists(self, path=None, cloud=None, **kwargs):
        """
        Checks if an index exists at path.

        Args:
            path: input path
            cloud: cloud storage configuration
            kwargs: additional configuration as keyword args

        Returns:
            True if index exists, False otherwise
        """

        # Check if this exists in a cloud instance
        cloud = self.createcloud(cloud=cloud, **kwargs)
        if cloud:
            return cloud.exists(path)

        # Check if this is an archive file and exists
        path, apath = self.checkarchive(path)
        if apath:
            return os.path.exists(apath)

        # Return true if path has a config or config.json file and an embeddings file
        return path and (os.path.exists(f"{path}/config") or os.path.exists(f"{path}/config.json")) and os.path.exists(f"{path}/embeddings")

    def load(self, path=None, cloud=None, **kwargs):
        """
        Loads an existing index from path.

        Args:
            path: input path
            cloud: cloud storage configuration
            kwargs: additional configuration as keyword args
        """

        # Load from cloud, if configured
        cloud = self.createcloud(cloud=cloud, **kwargs)
        if cloud:
            path = cloud.load(path)

        # Check if this is an archive file and extract
        path, apath = self.checkarchive(path)
        if apath:
            self.archive.load(apath)

        # Load index configuration
        self.config = self.loadconfig(path)

        # Approximate nearest neighbor index - stores embeddings vectors
        self.ann = ANNFactory.create(self.config)
        self.ann.load(f"{path}/embeddings")

        # Dimensionality reduction model - word vectors only
        if self.config.get("pca"):
            self.reducer = Reducer()
            self.reducer.load(f"{path}/lsa")

        # Embedding scoring index - word vectors only
        if self.config.get("scoring"):
            self.scoring = ScoringFactory.create(self.config["scoring"])
            self.scoring.load(f"{path}/scoring")

        # Sentence vectors model - transforms data to embeddings vectors
        self.model = self.loadvectors()

        # Query model
        self.query = self.loadquery()

        # Document database - stores document content
        self.database = self.createdatabase()
        if self.database:
            self.database.load(f"{path}/documents")

        # Graph network - stores relationships
        self.graph = self.creategraph()
        if self.graph:
            self.graph.load(f"{path}/graph")

    def save(self, path, cloud=None, **kwargs):
        """
        Saves an index in a directory at path unless path ends with tar.gz, tar.bz2, tar.xz or zip.
        In those cases, the index is stored as a compressed file.

        Args:
            path: output path
            cloud: cloud storage configuration
            kwargs: additional configuration as keyword args
        """

        if self.config:
            # Check if this is an archive file
            path, apath = self.checkarchive(path)

            # Create output directory, if necessary
            os.makedirs(path, exist_ok=True)

            # Copy sentence vectors model
            if self.config.get("storevectors"):
                shutil.copyfile(self.config["path"], os.path.join(path, os.path.basename(self.config["path"])))
                self.config["path"] = os.path.basename(self.config["path"])

            # Save index configuration
            self.saveconfig(path)

            # Save approximate nearest neighbor index
            self.ann.save(f"{path}/embeddings")

            # Save dimensionality reduction model (word vectors only)
            if self.reducer:
                self.reducer.save(f"{path}/lsa")

            # Save embedding scoring index (word vectors only)
            if self.scoring:
                self.scoring.save(f"{path}/scoring")

            # Save document database
            if self.database:
                self.database.save(f"{path}/documents")

            # Save graph
            if self.graph:
                self.graph.save(f"{path}/graph")

            # If this is an archive, save it
            if apath:
                self.archive.save(apath)

            # Save to cloud, if configured
            cloud = self.createcloud(cloud=cloud, **kwargs)
            if cloud:
                cloud.save(apath if apath else path)

    def close(self):
        """
        Closes this embeddings index and frees all resources.
        """

        self.config, self.reducer, self.scoring, self.model = None, None, None, None
        self.ann, self.graph, self.query, self.archive = None, None, None, None

        # Close database connection if open
        if self.database:
            self.database.close()
            self.database, self.functions = None, None

    def info(self):
        """
        Prints the current embeddings index configuration.
        """

        # Copy and edit config
        config = self.config.copy()

        # Remove ids array if present
        config.pop("ids", None)

        # Print configuration
        print(json.dumps(config, sort_keys=True, default=str, indent=2))

    def configure(self, config):
        """
        Sets the configuration for this embeddings index and loads config-driven models.

        Args:
            config: embeddings configuration
        """

        # Configuration
        self.config = config

        if self.config and self.config.get("method") != "transformers":
            # Dimensionality reduction model
            self.reducer = None

            # Embedding scoring method - weighs each word in a sentence
            self.scoring = ScoringFactory.create(self.config["scoring"]) if self.config and self.config.get("scoring") else None
        else:
            self.reducer, self.scoring = None, None

        # Sentence vectors model - transforms data to embeddings vectors
        self.model = self.loadvectors() if self.config else None

        # Query model
        self.query = self.loadquery() if self.config else None

    def defaults(self):
        """
        Builds a default configuration.

        Returns:
            default configuration
        """

        return {"path": "sentence-transformers/all-MiniLM-L6-v2"}

    def loadconfig(self, path):
        """
        Loads index configuration. This method supports both config pickle files and config.json files.

        Args:
            path: path to directory

        Returns:
            dict
        """

        # Configuration
        config = None

        # Determine if config is json or pickle
        jsonconfig = os.path.exists(f"{path}/config.json")

        # Set config file name
        name = "config.json" if jsonconfig else "config"

        # Load configuration
        with open(f"{path}/{name}", "r" if jsonconfig else "rb") as handle:
            config = json.load(handle) if jsonconfig else pickle.load(handle)

        # Build full path to embedding vectors file
        if config.get("storevectors"):
            config["path"] = os.path.join(path, config["path"])

        return config

    def saveconfig(self, path):
        """
        Saves index configuration. This method saves to JSON if possible, otherwise it falls back to pickle.

        Args:
            path: path to directory

        Returns:
            dict
        """

        # Default to pickle config
        jsonconfig = self.config.get("format", "pickle") == "json"

        # Set config file name
        name = "config.json" if jsonconfig else "config"

        # Write configuration
        with open(f"{path}/{name}", "w" if jsonconfig else "wb", encoding="utf-8" if jsonconfig else None) as handle:
            if jsonconfig:
                # Write config as JSON
                json.dump(self.config, handle, default=str, indent=2)
            else:
                # Write config as pickle format
                pickle.dump(self.config, handle, protocol=__pickle__)

    def loadvectors(self):
        """
        Loads a vector model set in config.

        Returns:
            vector model
        """

        return VectorsFactory.create(self.config, self.scoring)

    def loadquery(self):
        """
        Loads a query model set in config.

        Returns:
            query model
        """

        if "query" in self.config:
            return Query(**self.config["query"])

        return None

    def checkarchive(self, path):
        """
        Checks if path is an archive file.

        Args:
            path: path to check

        Returns:
            (working directory, current path) if this is an archive, original path otherwise
        """

        # Create archive instance, if necessary
        self.archive = ArchiveFactory.create()

        # Check if path is an archive file
        if self.archive.isarchive(path):
            # Return temporary archive working directory and original path
            return self.archive.path(), path

        return path, None

    def createcloud(self, **cloud):
        """
        Creates a cloud instance from config.

        Args:
            cloud: cloud configuration
        """

        # Merge keyword args and keys under the cloud parameter
        config = cloud
        if "cloud" in config and config["cloud"]:
            config.update(config.pop("cloud"))

        # Create cloud instance from config and return
        return CloudFactory.create(config) if config else None

    def createdatabase(self):
        """
        Creates a database from config. This method will also close any existing database connection.

        Returns:
            new database, if enabled in config
        """

        # Free existing database resources
        if self.database:
            self.database.close()

        config = self.config.copy()

        # Create references to callable functions
        self.functions = Functions(self) if "functions" in config else None
        if self.functions:
            config["functions"] = self.functions(config)

        # Create database from config and return
        return DatabaseFactory.create(config)

    def creategraph(self):
        """
        Creates a graph from config.

        Returns:
            new graph, if enabled in config
        """

        return GraphFactory.create(self.config["graph"]) if "graph" in self.config else None

    def normalize(self, embeddings):
        """
        Normalizes embeddings using L2 normalization. Operation applied directly on array.

        Args:
            embeddings: input embeddings matrix
        """

        # Calculation is different for matrices vs vectors
        if len(embeddings.shape) > 1:
            embeddings /= np.linalg.norm(embeddings, axis=1)[:, np.newaxis]
        else:
            embeddings /= np.linalg.norm(embeddings)

__init__(self, config=None) special

Creates a new embeddings index. Embeddings indexes are thread-safe for read operations but writes must be synchronized.

Parameters:

    config: embeddings configuration (default: None)

Source code in txtai/embeddings/base.py

def __init__(self, config=None):
    """
    Creates a new embeddings index. Embeddings indexes are thread-safe for read operations but writes must be
    synchronized.

    Args:
        config: embeddings configuration
    """

    # Index configuration
    self.config = None

    # Dimensionality reduction and scoring index - word vectors only
    self.reducer, self.scoring = None, None

    # Embeddings vector model - transforms data into similarity vectors
    self.model = None

    # Approximate nearest neighbor index
    self.ann = None

    # Document database
    self.database = None

    # Resolvable functions
    self.functions = None

    # Graph network
    self.graph = None

    # Query model
    self.query = None

    # Index archive
    self.archive = None

    # Set initial configuration
    self.configure(config)
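
The constructor only stores configuration and loads config-driven models; no index exists until index or load is called. A sketch of a configuration enabling content storage (both keys are standard txtai settings):

# Store document content in a database alongside the vector index
embeddings = Embeddings({
    "path": "sentence-transformers/all-MiniLM-L6-v2",
    "content": True
})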

batchexplain(self, queries, texts=None, limit=None)

Explains the importance of each input token in text for a list of queries.

Parameters:

    queries: input queries (required)
    texts: optional list of (text|list of tokens), otherwise runs search queries (default: None)
    limit: optional limit if texts is None (default: None)

Returns:

    list of dicts per input text per query where higher token scores represent higher importance relative to the query

Source code in txtai/embeddings/base.py

def batchexplain(self, queries, texts=None, limit=None):
    """
    Explains the importance of each input token in text for a list of queries.

    Args:
        queries: input queries
        texts: optional list of (text|list of tokens), otherwise runs search queries
        limit: optional limit if texts is None

    Returns:
        list of dict per input text per query where a higher token scores represents higher importance relative to the query
    """

    return Explain(self)(queries, texts, limit)

batchsearch(self, queries, limit=None)

Finds documents most similar to the input queries. This method will run either an approximate nearest neighbor (ann) search or an approximate nearest neighbor + database search depending on whether a database is available.

Parameters:

    queries: input queries (required)
    limit: maximum results (default: None)

Returns:

    list of (id, score) per query for ann search, list of dict per query for an ann+database search

Source code in txtai/embeddings/base.py

def batchsearch(self, queries, limit=None):
    """
    Finds documents most similar to the input queries. This method will run either an approximate
    nearest neighbor (ann) search or an approximate nearest neighbor + database search depending
    on if a database is available.

    Args:
        queries: input queries
        limit: maximum results

    Returns:
        list of (id, score) per query for ann search, list of dict per query for an ann+database search
    """

    return Search(self)(queries, limit if limit else 3)
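
A usage sketch, assuming an index has already been built; note that limit defaults to 3 when not provided:

# One result list per input query
for results in embeddings.batchsearch(["feel good story", "climate change"], 1):
    print(results)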

batchsimilarity(self, queries, data)

Computes the similarity between list of queries and list of data. Returns a list of (id, score) sorted by highest score per query, where id is the index in data.

Parameters:

    queries: input queries (required)
    data: list of data (required)

Returns:

    list of (id, score) per query

Source code in txtai/embeddings/base.py

def batchsimilarity(self, queries, data):
    """
    Computes the similarity between list of queries and list of data. Returns a list
    of (id, score) sorted by highest score per query, where id is the index in data.

    Args:
        queries: input queries
        data: list of data

    Returns:
        list of (id, score) per query
    """

    # Convert queries to embedding vectors
    queries = self.batchtransform(((None, query, None) for query in queries), "query")
    data = self.batchtransform(((None, row, None) for row in data), "data")

    # Dot product on normalized vectors is equal to cosine similarity
    scores = np.dot(queries, data.T).tolist()

    # Add index and sort desc based on score
    return [sorted(enumerate(score), key=lambda x: x[1], reverse=True) for score in scores]
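
A sketch of ad hoc similarity scoring; this works without an index since it only uses the vector model (sample data is illustrative):

queries = ["feel good story", "climate change"]
data = ["Maine man wins $1M from $25 lottery ticket",
        "Canada's last fully intact ice shelf has suddenly collapsed"]

# One (id, score) list per query, where id is the position in data
for scores in embeddings.batchsimilarity(queries, data):
    print(scores)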

batchterms(self, queries)

Extracts keyword terms from a list of queries.

Parameters:

    queries: list of queries (required)

Returns:

    list of queries reduced down to keyword term strings

Source code in txtai/embeddings/base.py

def batchterms(self, queries):
    """
    Extracts keyword terms from a list of queries.

    Args:
        queries: list of queries

    Returns:
        list of queries reduced down to keyword term strings
    """

    return Terms(self)(queries)

batchtransform(self, documents, category=None)

Transforms documents into embeddings vectors.

Parameters:

    documents: list of (id, data, tags) (required)
    category: category for instruction-based embeddings (default: None)

Returns:

    embeddings vectors

Source code in txtai/embeddings/base.py

def batchtransform(self, documents, category=None):
    """
    Transforms documents into embeddings vectors.

    Args:
        documents: list of (id, data, tags)
        category: category for instruction-based embeddings

    Returns:
        embeddings vectors
    """

    # Convert documents into sentence embeddings
    embeddings = self.model.batchtransform(documents, category)

    # Reduce the dimensionality of the embeddings. Scale the embeddings using this
    # model to reduce the noise of common but less relevant terms.
    if self.reducer:
        self.reducer(embeddings)

    # Normalize embeddings
    self.normalize(embeddings)

    return embeddings
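
For illustration, transforming raw text into vectors; the shapes shown assume a 384-dimension model such as all-MiniLM-L6-v2:

import numpy as np

vectors = embeddings.batchtransform([(None, "first text", None), (None, "second text", None)])
print(vectors.shape)               # e.g. (2, 384)
print(np.linalg.norm(vectors[0]))  # ~1.0, vectors are L2-normalized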

close(self)

Closes this embeddings index and frees all resources.

Source code in txtai/embeddings/base.py

def close(self):
    """
    Closes this embeddings index and frees all resources.
    """

    self.config, self.reducer, self.scoring, self.model = None, None, None, None
    self.ann, self.graph, self.query, self.archive = None, None, None, None

    # Close database connection if open
    if self.database:
        self.database.close()
        self.database, self.functions = None, None

count(self)

Total number of elements in this embeddings index.

Returns:

    number of elements in this embeddings index

Source code in txtai/embeddings/base.py

def count(self):
    """
    Total number of elements in this embeddings index.

    Returns:
        number of elements in this embeddings index
    """

    return self.ann.count() if self.ann else 0

delete(self, ids)

Deletes from an embeddings index. Returns list of ids deleted.

Parameters:

    ids: list of ids to delete (required)

Returns:

    list of ids deleted

Source code in txtai/embeddings/base.py

def delete(self, ids):
    """
    Deletes from an embeddings index. Returns list of ids deleted.

    Args:
        ids: list of ids to delete

    Returns:
        list of ids deleted
    """

    # List of internal indices for each candidate id to delete
    indices = []

    # List of deleted ids
    deletes = []

    if self.database:
        # Retrieve indexid-id mappings from database
        ids = self.database.ids(ids)

        # Parse out indices and ids to delete
        indices = [i for i, _ in ids]
        deletes = sorted(set(uid for _, uid in ids))

        # Delete ids from database
        self.database.delete(deletes)
    elif self.ann:
        # Lookup indexids from config for indexes with no database
        indexids = self.config["ids"]

        # Find existing ids
        for uid in ids:
            indices.extend([index for index, value in enumerate(indexids) if uid == value])

        # Clear config ids
        for index in indices:
            deletes.append(indexids[index])
            indexids[index] = None

    # Delete indices from ann embeddings
    if indices:
        # Delete ids from index
        self.ann.delete(indices)

        # Delete ids from graph
        if self.graph:
            self.graph.delete(indices)

    return deletes
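
A sketch of delete semantics (sample ids and data are illustrative):

embeddings.index([(0, "first", None), (1, "second", None)])
print(embeddings.delete([0]))  # [0]
print(embeddings.count())      # 1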

exists(self, path=None, cloud=None, **kwargs)

Checks if an index exists at path.

Parameters:

    path: input path (default: None)
    cloud: cloud storage configuration (default: None)
    kwargs: additional configuration as keyword args

Returns:

    True if index exists, False otherwise

Source code in txtai/embeddings/base.py

def exists(self, path=None, cloud=None, **kwargs):
    """
    Checks if an index exists at path.

    Args:
        path: input path
        cloud: cloud storage configuration
        kwargs: additional configuration as keyword args

    Returns:
        True if index exists, False otherwise
    """

    # Check if this exists in a cloud instance
    cloud = self.createcloud(cloud=cloud, **kwargs)
    if cloud:
        return cloud.exists(path)

    # Check if this is an archive file and exists
    path, apath = self.checkarchive(path)
    if apath:
        return os.path.exists(apath)

    # Return true if path has a config or config.json file and an embeddings file
    return path and (os.path.exists(f"{path}/config") or os.path.exists(f"{path}/config.json")) and os.path.exists(f"{path}/embeddings")
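
A typical guard before loading, shown as a sketch with an illustrative path:

path = "index.tar.gz"
if embeddings.exists(path):
    embeddings.load(path)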

explain(self, query, texts=None, limit=None)

Explains the importance of each input token in text for a query.

Parameters:

    query: input query (required)
    texts: optional list of (text|list of tokens), otherwise runs search query (default: None)
    limit: optional limit if texts is None (default: None)

Returns:

    list of dicts per input text where higher token scores represent higher importance relative to the query

Source code in txtai/embeddings/base.py

def explain(self, query, texts=None, limit=None):
    """
    Explains the importance of each input token in text for a query.

    Args:
        query: input query
        texts: optional list of (text|list of tokens), otherwise runs search query
        limit: optional limit if texts is None

    Returns:
        list of dict per input text where a higher token scores represents higher importance relative to the query
    """

    results = self.batchexplain([query], texts, limit)
    return results[0] if results else results
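
A usage sketch, assuming an index built with content storage enabled so search results carry text to explain:

# Token-level importance for the top search result
print(embeddings.explain("climate change", limit=1))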

index(self, documents, reindex=False)

Builds an embeddings index. This method overwrites an existing index.

Parameters:

    documents: list of (id, data, tags) (required)
    reindex: if this is a reindex operation in which case database creation is skipped (default: False)

Source code in txtai/embeddings/base.py

def index(self, documents, reindex=False):
    """
    Builds an embeddings index. This method overwrites an existing index.

    Args:
        documents: list of (id, data, tags)
        reindex: if this is a reindex operation in which case database creation is skipped, defaults to False
    """

    # Set configuration to default configuration, if empty
    if not self.config:
        self.configure(self.defaults())

    # Create document database, if necessary
    if not reindex:
        self.database = self.createdatabase()

        # Reset archive since this is a new index
        self.archive = None

    # Create graph, if necessary
    self.graph = self.creategraph()

    # Create transform action
    transform = Transform(self, Action.REINDEX if reindex else Action.INDEX)

    with tempfile.NamedTemporaryFile(mode="wb", suffix=".npy") as buffer:
        # Load documents into database and transform to vectors
        ids, dimensions, embeddings = transform(documents, buffer)

        if ids:
            # Build LSA model (if enabled). Remove principal components from embeddings.
            if self.config.get("pca"):
                self.reducer = Reducer(embeddings, self.config["pca"])
                self.reducer(embeddings)

            # Normalize embeddings
            self.normalize(embeddings)

            # Save index dimensions
            self.config["dimensions"] = dimensions

            # Create approximate nearest neighbor index
            self.ann = ANNFactory.create(self.config)

            # Add embeddings to the index
            self.ann.index(embeddings)

            # Save indexids-ids mapping for indexes with no database, except when this is a reindex action
            if not reindex and not self.database:
                self.config["ids"] = ids

    # Index graph, if necessary
    if self.graph:
        self.graph.index(Search(self, True), self.batchsimilarity)
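
Since index overwrites, repeated calls replace prior data rather than appending to it; a sketch:

embeddings.index([(0, "first version", None)])
embeddings.index([(0, "second version", None)])  # replaces the prior index
print(embeddings.count())                        # 1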

info(self)

Prints the current embeddings index configuration.

Source code in txtai/embeddings/base.py

def info(self):
    """
    Prints the current embeddings index configuration.
    """

    # Copy and edit config
    config = self.config.copy()

    # Remove ids array if present
    config.pop("ids", None)

    # Print configuration
    print(json.dumps(config, sort_keys=True, default=str, indent=2))

load(self, path=None, cloud=None, **kwargs)

Loads an existing index from path.

Parameters:

    path: input path (default: None)
    cloud: cloud storage configuration (default: None)
    kwargs: additional configuration as keyword args

Source code in txtai/embeddings/base.py

def load(self, path=None, cloud=None, **kwargs):
    """
    Loads an existing index from path.

    Args:
        path: input path
        cloud: cloud storage configuration
        kwargs: additional configuration as keyword args
    """

    # Load from cloud, if configured
    cloud = self.createcloud(cloud=cloud, **kwargs)
    if cloud:
        path = cloud.load(path)

    # Check if this is an archive file and extract
    path, apath = self.checkarchive(path)
    if apath:
        self.archive.load(apath)

    # Load index configuration
    self.config = self.loadconfig(path)

    # Approximate nearest neighbor index - stores embeddings vectors
    self.ann = ANNFactory.create(self.config)
    self.ann.load(f"{path}/embeddings")

    # Dimensionality reduction model - word vectors only
    if self.config.get("pca"):
        self.reducer = Reducer()
        self.reducer.load(f"{path}/lsa")

    # Embedding scoring index - word vectors only
    if self.config.get("scoring"):
        self.scoring = ScoringFactory.create(self.config["scoring"])
        self.scoring.load(f"{path}/scoring")

    # Sentence vectors model - transforms data to embeddings vectors
    self.model = self.loadvectors()

    # Query model
    self.query = self.loadquery()

    # Document database - stores document content
    self.database = self.createdatabase()
    if self.database:
        self.database.load(f"{path}/documents")

    # Graph network - stores relationships
    self.graph = self.creategraph()
    if self.graph:
        self.graph.load(f"{path}/graph")
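
A load sketch; an empty Embeddings instance can be pointed at a previously saved path:

embeddings = Embeddings()
embeddings.load("path/to/index")  # illustrative path, directory or archive file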

reindex(self, config, columns=None, function=None)

Recreates the approximate nearest neighbor (ann) index using config. This method only works if document content storage is enabled.

Parameters:

    config: new config (required)
    columns: optional list of document columns used to rebuild data (default: None)
    function: optional function to prepare content for indexing (default: None)

Source code in txtai/embeddings/base.py

def reindex(self, config, columns=None, function=None):
    """
    Recreates the approximate nearest neighbor (ann) index using config. This method only works if document
    content storage is enabled.

    Args:
        config: new config
        columns: optional list of document columns used to rebuild data
        function: optional function to prepare content for indexing
    """

    if self.database:
        # Keep content and objects parameters to ensure database is preserved
        config["content"] = self.config["content"]
        if "objects" in self.config:
            config["objects"] = self.config["objects"]

        # Reset configuration
        self.configure(config)

        # Reset function references
        if self.functions:
            self.functions.reset()

        # Reindex
        if function:
            self.index(function(self.database.reindex(columns)), True)
        else:
            self.index(self.database.reindex(columns), True)
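
A reindex sketch, assuming the original index was created with "content": True; the replacement model path is illustrative:

# Rebuild vectors with a different model, keeping the stored documents
embeddings.reindex({"path": "sentence-transformers/paraphrase-MiniLM-L3-v2"})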

save(self, path, cloud=None, **kwargs)

Saves an index in a directory at path unless path ends with tar.gz, tar.bz2, tar.xz or zip. In those cases, the index is stored as a compressed file.

Parameters:

NameTypeDescriptionDefault
path

output path

required
cloud

cloud storage configuration

None
kwargs

additional configuration as keyword args

{}

Source code in txtai/embeddings/base.py

def save(self, path, cloud=None, **kwargs):
    """
    Saves an index in a directory at path unless path ends with tar.gz, tar.bz2, tar.xz or zip.
    In those cases, the index is stored as a compressed file.

    Args:
        path: output path
        cloud: cloud storage configuration
        kwargs: additional configuration as keyword args
    """

    if self.config:
        # Check if this is an archive file
        path, apath = self.checkarchive(path)

        # Create output directory, if necessary
        os.makedirs(path, exist_ok=True)

        # Copy sentence vectors model
        if self.config.get("storevectors"):
            shutil.copyfile(self.config["path"], os.path.join(path, os.path.basename(self.config["path"])))
            self.config["path"] = os.path.basename(self.config["path"])

        # Save index configuration
        self.saveconfig(path)

        # Save approximate nearest neighbor index
        self.ann.save(f"{path}/embeddings")

        # Save dimensionality reduction model (word vectors only)
        if self.reducer:
            self.reducer.save(f"{path}/lsa")

        # Save embedding scoring index (word vectors only)
        if self.scoring:
            self.scoring.save(f"{path}/scoring")

        # Save document database
        if self.database:
            self.database.save(f"{path}/documents")

        # Save graph
        if self.graph:
            self.graph.save(f"{path}/graph")

        # If this is an archive, save it
        if apath:
            self.archive.save(apath)

        # Save to cloud, if configured
        cloud = self.createcloud(cloud=cloud, **kwargs)
        if cloud:
            cloud.save(apath if apath else path)
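
A save sketch showing both output forms (paths are illustrative):

embeddings.save("index")         # plain directory
embeddings.save("index.tar.gz")  # compressed archive, inferred from the extension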

score(self, documents)

Builds a scoring index. Only used by word vectors models.

Parameters:

    documents: list of (id, data, tags) (required)

Source code in txtai/embeddings/base.py

def score(self, documents):
    """
    Builds a scoring index. Only used by word vectors models.

    Args:
        documents: list of (id, data, tags)
    """

    # Build scoring index over documents
    if self.scoring:
        self.scoring.index(documents)

search(self, query, limit=None)

Finds documents most similar to the input query. This method will run either an approximate nearest neighbor (ann) search or an approximate nearest neighbor + database search depending on whether a database is available.

Parameters:

    query: input query (required)
    limit: maximum results (default: None)

Returns:

    list of (id, score) for ann search, list of dict for an ann+database search

Source code in txtai/embeddings/base.py

def search(self, query, limit=None):
    """
    Finds documents most similar to the input queries. This method will run either an approximate
    nearest neighbor (ann) search or an approximate nearest neighbor + database search depending
    on if a database is available.

    Args:
        query: input query
        limit: maximum results

    Returns:
        list of (id, score) for ann search, list of dict for an ann+database search
    """

    results = self.batchsearch([query], limit)
    return results[0] if results else results
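
A sketch of the two result shapes:

# (id, score) tuples without a database; dicts with id/text/score when content storage is enabled
print(embeddings.search("feel good story", 1))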

similarity(self, query, data)

Computes the similarity between query and list of data. Returns a list of (id, score) sorted by highest score, where id is the index in data.

Parameters:

    query: input query (required)
    data: list of data (required)

Returns:

    list of (id, score)

Source code in txtai/embeddings/base.py

def similarity(self, query, data):
    """
    Computes the similarity between query and list of data. Returns a list of
    (id, score) sorted by highest score, where id is the index in data.

    Args:
        query: input query
        data: list of data

    Returns:
        list of (id, score)
    """

    return self.batchsimilarity([query], data)[0]
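
A usage sketch; ids in the output are positions in data (sample data is illustrative):

print(embeddings.similarity("feel good story",
                            ["Maine man wins $1M from $25 lottery ticket",
                             "Rainfall hits record levels"]))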

terms(self, query)

Extracts keyword terms from a query.

Parameters:

    query: input query (required)

Returns:

    query reduced down to keyword terms

Source code in txtai/embeddings/base.py

def terms(self, query):
    """
    Extracts keyword terms from a query.

    Args:
        query: input query

    Returns:
        query reduced down to keyword terms
    """

    return self.batchterms([query])[0]

transform(self, document)

Transforms document into an embeddings vector.

Parameters:

    document: (id, data, tags) (required)

Returns:

    embeddings vector

Source code in txtai/embeddings/base.py

def transform(self, document):
    """
    Transforms document into an embeddings vector.

    Args:
        document: (id, data, tags)

    Returns:
        embeddings vector
    """

    return self.batchtransform([document])[0]

upsert(self, documents)

Runs an embeddings upsert operation. If the index exists, new data is appended to the index and existing data is updated. If the index doesn't exist, this method runs a standard index operation.

Parameters:

    documents: list of (id, data, tags) (required)

Source code in txtai/embeddings/base.py

def upsert(self, documents):
    """
    Runs an embeddings upsert operation. If the index exists, new data is
    appended to the index, existing data is updated. If the index doesn't exist,
    this method runs a standard index operation.

    Args:
        documents: list of (id, data, tags)
    """

    # Run standard insert if index doesn't exist or it has no records
    if not self.count():
        self.index(documents)
        return

    # Create transform action
    transform = Transform(self, Action.UPSERT)

    with tempfile.NamedTemporaryFile(mode="wb", suffix=".npy") as buffer:
        # Load documents into database and transform to vectors
        ids, _, embeddings = transform(documents, buffer)

        if ids:
            # Remove principal components from embeddings, if necessary
            if self.reducer:
                self.reducer(embeddings)

            # Normalize embeddings
            self.normalize(embeddings)

            # Append embeddings to the index
            self.ann.append(embeddings)

            # Save indexids-ids mapping for indexes with no database
            if not self.database:
                self.config["ids"] = self.config["ids"] + ids

    # Graph upsert, if necessary
    if self.graph:
        self.graph.upsert(Search(self, True), self.batchsimilarity)
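
An upsert sketch, assuming an index with content storage enabled so existing ids are updated in place (sample data is illustrative):

embeddings.index([(0, "original text", None)])
embeddings.upsert([(0, "updated text", None), (1, "brand new text", None)])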