Methods

Embeddings is the engine that delivers semantic search. Data is transformed into embedding vectors, where similar concepts produce similar vectors. Indexes, both large and small, are built with these vectors. The indexes are used to find results that have the same meaning, not necessarily the same keywords.

Source code in txtai/embeddings/base.py

  1. class Embeddings:
  2. """
  3. Embeddings is the engine that delivers semantic search. Data is transformed into embeddings vectors where similar concepts
  4. will produce similar vectors. Indexes both large and small are built with these vectors. The indexes are used to find results
  5. that have the same meaning, not necessarily the same keywords.
  6. """
  7. # pylint: disable = W0231
  8. def __init__(self, config=None, models=None, **kwargs):
  9. """
  10. Creates a new embeddings index. Embeddings indexes are thread-safe for read operations but writes must be synchronized.
  11. Args:
  12. config: embeddings configuration
  13. models: models cache, used for model sharing between embeddings
  14. kwargs: additional configuration as keyword args
  15. """
  16. # Index configuration
  17. self.config = None
  18. # Dimensionality reduction - word vectors only
  19. self.reducer = None
  20. # Dense vector model - transforms data into similarity vectors
  21. self.model = None
  22. # Approximate nearest neighbor index
  23. self.ann = None
  24. # Document database
  25. self.database = None
  26. # Resolvable functions
  27. self.functions = None
  28. # Graph network
  29. self.graph = None
  30. # Sparse vectors
  31. self.scoring = None
  32. # Query model
  33. self.query = None
  34. # Index archive
  35. self.archive = None
  36. # Subindexes for this embeddings instance
  37. self.indexes = None
  38. # Models cache
  39. self.models = models
  40. # Merge configuration into single dictionary
  41. config = {**config, **kwargs} if config and kwargs else kwargs if kwargs else config
  42. # Set initial configuration
  43. self.configure(config)
  44. def score(self, documents):
  45. """
  46. Builds a term weighting scoring index. Only used by word vectors models.
  47. Args:
  48. documents: iterable of (id, data, tags), (id, data) or data
  49. """
  50. # Build scoring index for word vectors term weighting
  51. if self.isweighted():
  52. self.scoring.index(Stream(self)(documents))
  53. def index(self, documents, reindex=False):
  54. """
  55. Builds an embeddings index. This method overwrites an existing index.
  56. Args:
  57. documents: iterable of (id, data, tags), (id, data) or data
  58. reindex: if this is a reindex operation in which case database creation is skipped, defaults to False
  59. """
  60. # Initialize index
  61. self.initindex(reindex)
  62. # Create transform and stream
  63. transform = Transform(self, Action.REINDEX if reindex else Action.INDEX)
  64. stream = Stream(self, Action.REINDEX if reindex else Action.INDEX)
  65. with tempfile.NamedTemporaryFile(mode="wb", suffix=".npy") as buffer:
  66. # Load documents into database and transform to vectors
  67. ids, dimensions, embeddings = transform(stream(documents), buffer)
  68. if embeddings is not None:
  69. # Build LSA model (if enabled). Remove principal components from embeddings.
  70. if self.config.get("pca"):
  71. self.reducer = Reducer(embeddings, self.config["pca"])
  72. self.reducer(embeddings)
  73. # Normalize embeddings
  74. self.normalize(embeddings)
  75. # Save index dimensions
  76. self.config["dimensions"] = dimensions
  77. # Create approximate nearest neighbor index
  78. self.ann = self.createann()
  79. # Add embeddings to the index
  80. self.ann.index(embeddings)
  81. # Save indexids-ids mapping for indexes with no database, except when this is a reindex
  82. if ids and not reindex and not self.database:
  83. self.config["ids"] = ids
  84. # Index scoring, if necessary
  85. # This must occur before graph index in order to be available to the graph
  86. if self.issparse():
  87. self.scoring.index()
  88. # Index subindexes, if necessary
  89. if self.indexes:
  90. self.indexes.index()
  91. # Index graph, if necessary
  92. if self.graph:
  93. self.graph.index(Search(self, True), self.batchsimilarity)
  94. def upsert(self, documents):
  95. """
  96. Runs an embeddings upsert operation. If the index exists, new data is
  97. appended to the index, existing data is updated. If the index doesn't exist,
  98. this method runs a standard index operation.
  99. Args:
  100. documents: iterable of (id, data, tags), (id, data) or data
  101. """
  102. # Run standard insert if index doesn't exist or it has no records
  103. if not self.count():
  104. self.index(documents)
  105. return
  106. # Create transform and stream
  107. transform = Transform(self, Action.UPSERT)
  108. stream = Stream(self, Action.UPSERT)
  109. with tempfile.NamedTemporaryFile(mode="wb", suffix=".npy") as buffer:
  110. # Load documents into database and transform to vectors
  111. ids, _, embeddings = transform(stream(documents), buffer)
  112. if embeddings is not None:
  113. # Remove principal components from embeddings, if necessary
  114. if self.reducer:
  115. self.reducer(embeddings)
  116. # Normalize embeddings
  117. self.normalize(embeddings)
  118. # Append embeddings to the index
  119. self.ann.append(embeddings)
  120. # Save indexids-ids mapping for indexes with no database
  121. if ids and not self.database:
  122. self.config["ids"] = self.config["ids"] + ids
  123. # Scoring upsert, if necessary
  124. # This must occur before graph upsert in order to be available to the graph
  125. if self.issparse():
  126. self.scoring.upsert()
  127. # Subindexes upsert, if necessary
  128. if self.indexes:
  129. self.indexes.upsert()
  130. # Graph upsert, if necessary
  131. if self.graph:
  132. self.graph.upsert(Search(self, True), self.batchsimilarity)
  133. def delete(self, ids):
  134. """
  135. Deletes from an embeddings index. Returns list of ids deleted.
  136. Args:
  137. ids: list of ids to delete
  138. Returns:
  139. list of ids deleted
  140. """
  141. # List of internal indices for each candidate id to delete
  142. indices = []
  143. # List of deleted ids
  144. deletes = []
  145. if self.database:
  146. # Retrieve indexid-id mappings from database
  147. ids = self.database.ids(ids)
  148. # Parse out indices and ids to delete
  149. indices = [i for i, _ in ids]
  150. deletes = sorted(set(uid for _, uid in ids))
  151. # Delete ids from database
  152. self.database.delete(deletes)
  153. elif self.ann or self.scoring:
  154. # Lookup indexids from config for indexes with no database
  155. indexids = self.config["ids"]
  156. # Find existing ids
  157. for uid in ids:
  158. indices.extend([index for index, value in enumerate(indexids) if uid == value])
  159. # Clear config ids
  160. for index in indices:
  161. deletes.append(indexids[index])
  162. indexids[index] = None
  163. # Delete indices for all indexes and data stores
  164. if indices:
  165. # Delete ids from ann
  166. if self.isdense():
  167. self.ann.delete(indices)
  168. # Delete ids from scoring
  169. if self.issparse():
  170. self.scoring.delete(indices)
  171. # Delete ids from subindexes
  172. if self.indexes:
  173. self.indexes.delete(indices)
  174. # Delete ids from graph
  175. if self.graph:
  176. self.graph.delete(indices)
  177. return deletes
  178. def reindex(self, config=None, function=None, **kwargs):
  179. """
  180. Recreates embeddings index using config. This method only works if document content storage is enabled.
  181. Args:
  182. config: new config
  183. function: optional function to prepare content for indexing
  184. kwargs: additional configuration as keyword args
  185. """
  186. if self.database:
  187. # Merge configuration into single dictionary
  188. config = {**config, **kwargs} if config and kwargs else config if config else kwargs
  189. # Keep content and objects parameters to ensure database is preserved
  190. config["content"] = self.config["content"]
  191. if "objects" in self.config:
  192. config["objects"] = self.config["objects"]
  193. # Reset configuration
  194. self.configure(config)
  195. # Reset function references
  196. if self.functions:
  197. self.functions.reset()
  198. # Reindex
  199. if function:
  200. self.index(function(self.database.reindex(self.config)), True)
  201. else:
  202. self.index(self.database.reindex(self.config), True)
  203. def transform(self, document):
  204. """
  205. Transforms document into an embeddings vector.
  206. Args:
  207. documents: iterable of (id, data, tags), (id, data) or data
  208. Returns:
  209. embeddings vector
  210. """
  211. return self.batchtransform([document])[0]
  212. def batchtransform(self, documents, category=None):
  213. """
  214. Transforms documents into embeddings vectors.
  215. Args:
  216. documents: iterable of (id, data, tags), (id, data) or data
  217. category: category for instruction-based embeddings
  218. Returns:
  219. embeddings vectors
  220. """
  221. # Initialize default parameters, if necessary
  222. self.defaults()
  223. # Convert documents into sentence embeddings
  224. embeddings = self.model.batchtransform(Stream(self)(documents), category)
  225. # Reduce the dimensionality of the embeddings. Scale the embeddings using this
  226. # model to reduce the noise of common but less relevant terms.
  227. if self.reducer:
  228. self.reducer(embeddings)
  229. # Normalize embeddings
  230. self.normalize(embeddings)
  231. return embeddings
  232. def count(self):
  233. """
  234. Total number of elements in this embeddings index.
  235. Returns:
  236. number of elements in this embeddings index
  237. """
  238. if self.ann:
  239. return self.ann.count()
  240. if self.scoring:
  241. return self.scoring.count()
  242. if self.database:
  243. return self.database.count()
  244. if self.config.get("ids"):
  245. return len([uid for uid in self.config["ids"] if uid is not None])
  246. # Default to 0 when no suitable method found
  247. return 0
  248. def search(self, query, limit=None, weights=None, index=None):
  249. """
  250. Finds documents most similar to the input query. This method will run either an index search
  251. or an index + database search depending on if a database is available.
  252. Args:
  253. query: input query
  254. limit: maximum results
  255. weights: hybrid score weights, if applicable
  256. index: index name, if applicable
  257. Returns:
  258. list of (id, score) for index search, list of dict for an index + database search
  259. """
  260. results = self.batchsearch([query], limit, weights, index)
  261. return results[0] if results else results
  262. def batchsearch(self, queries, limit=None, weights=None, index=None):
  263. """
  264. Finds documents most similar to the input queries. This method will run either an index search
  265. or an index + database search depending on if a database is available.
  266. Args:
  267. queries: input queries
  268. limit: maximum results
  269. weights: hybrid score weights, if applicable
  270. index: index name, if applicable
  271. Returns:
  272. list of (id, score) per query for index search, list of dict per query for an index + database search
  273. """
  274. return Search(self)(queries, limit, weights, index)
  275. def similarity(self, query, data):
  276. """
  277. Computes the similarity between query and list of data. Returns a list of
  278. (id, score) sorted by highest score, where id is the index in data.
  279. Args:
  280. query: input query
  281. data: list of data
  282. Returns:
  283. list of (id, score)
  284. """
  285. return self.batchsimilarity([query], data)[0]
  286. def batchsimilarity(self, queries, data):
  287. """
  288. Computes the similarity between list of queries and list of data. Returns a list
  289. of (id, score) sorted by highest score per query, where id is the index in data.
  290. Args:
  291. queries: input queries
  292. data: list of data
  293. Returns:
  294. list of (id, score) per query
  295. """
  296. # Convert queries to embedding vectors
  297. queries = self.batchtransform(((None, query, None) for query in queries), "query")
  298. data = self.batchtransform(((None, row, None) for row in data), "data")
  299. # Dot product on normalized vectors is equal to cosine similarity
  300. scores = np.dot(queries, data.T).tolist()
  301. # Add index and sort desc based on score
  302. return [sorted(enumerate(score), key=lambda x: x[1], reverse=True) for score in scores]
  303. def explain(self, query, texts=None, limit=None):
  304. """
  305. Explains the importance of each input token in text for a query. This method requires either content to be enabled
  306. or texts to be provided.
  307. Args:
  308. query: input query
  309. texts: optional list of (text|list of tokens), otherwise runs search query
  310. limit: optional limit if texts is None
  311. Returns:
  312. list of dict per input text where a higher token scores represents higher importance relative to the query
  313. """
  314. results = self.batchexplain([query], texts, limit)
  315. return results[0] if results else results
  316. def batchexplain(self, queries, texts=None, limit=None):
  317. """
  318. Explains the importance of each input token in text for a list of queries. This method requires either content to be enabled
  319. or texts to be provided.
  320. Args:
  321. queries: input queries
  322. texts: optional list of (text|list of tokens), otherwise runs search queries
  323. limit: optional limit if texts is None
  324. Returns:
  325. list of dict per input text per query where a higher token scores represents higher importance relative to the query
  326. """
  327. return Explain(self)(queries, texts, limit)
  328. def terms(self, query):
  329. """
  330. Extracts keyword terms from a query.
  331. Args:
  332. query: input query
  333. Returns:
  334. query reduced down to keyword terms
  335. """
  336. return self.batchterms([query])[0]
  337. def batchterms(self, queries):
  338. """
  339. Extracts keyword terms from a list of queries.
  340. Args:
  341. queries: list of queries
  342. Returns:
  343. list of queries reduced down to keyword term strings
  344. """
  345. return Terms(self)(queries)
  346. def exists(self, path=None, cloud=None, **kwargs):
  347. """
  348. Checks if an index exists at path.
  349. Args:
  350. path: input path
  351. cloud: cloud storage configuration
  352. kwargs: additional configuration as keyword args
  353. Returns:
  354. True if index exists, False otherwise
  355. """
  356. # Check if this exists in a cloud instance
  357. cloud = self.createcloud(cloud=cloud, **kwargs)
  358. if cloud:
  359. return cloud.exists(path)
  360. # Check if this is an archive file and exists
  361. path, apath = self.checkarchive(path)
  362. if apath:
  363. return os.path.exists(apath)
  364. # Return true if path has a config or config.json file and an embeddings (dense) or scoring (sparse) file
  365. return (
  366. path
  367. and (os.path.exists(f"{path}/config") or os.path.exists(f"{path}/config.json"))
  368. and (os.path.exists(f"{path}/embeddings") or os.path.exists(f"{path}/scoring"))
  369. )
  370. def load(self, path=None, cloud=None, **kwargs):
  371. """
  372. Loads an existing index from path.
  373. Args:
  374. path: input path
  375. cloud: cloud storage configuration
  376. kwargs: additional configuration as keyword args
  377. """
  378. # Load from cloud, if configured
  379. cloud = self.createcloud(cloud=cloud, **kwargs)
  380. if cloud:
  381. path = cloud.load(path)
  382. # Check if this is an archive file and extract
  383. path, apath = self.checkarchive(path)
  384. if apath:
  385. self.archive.load(apath)
  386. # Load index configuration
  387. self.config = self.loadconfig(path)
  388. # Approximate nearest neighbor index - stores dense vectors
  389. self.ann = self.createann()
  390. if self.ann:
  391. self.ann.load(f"{path}/embeddings")
  392. # Dimensionality reduction model - word vectors only
  393. if self.config.get("pca"):
  394. self.reducer = Reducer()
  395. self.reducer.load(f"{path}/lsa")
  396. # Document database - stores document content
  397. self.database = self.createdatabase()
  398. if self.database:
  399. self.database.load(f"{path}/documents")
  400. # Sparse vectors - stores term sparse arrays
  401. self.scoring = self.createscoring()
  402. if self.scoring:
  403. self.scoring.load(f"{path}/scoring")
  404. # Subindexes
  405. self.indexes = self.createindexes()
  406. if self.indexes:
  407. self.indexes.load(f"{path}/indexes")
  408. # Graph network - stores relationships
  409. self.graph = self.creategraph()
  410. if self.graph:
  411. self.graph.load(f"{path}/graph")
  412. # Dense vectors - transforms data to embeddings vectors
  413. self.model = self.loadvectors()
  414. # Query model
  415. self.query = self.loadquery()
  416. def save(self, path, cloud=None, **kwargs):
  417. """
  418. Saves an index in a directory at path unless path ends with tar.gz, tar.bz2, tar.xz or zip.
  419. In those cases, the index is stored as a compressed file.
  420. Args:
  421. path: output path
  422. cloud: cloud storage configuration
  423. kwargs: additional configuration as keyword args
  424. """
  425. if self.config:
  426. # Check if this is an archive file
  427. path, apath = self.checkarchive(path)
  428. # Create output directory, if necessary
  429. os.makedirs(path, exist_ok=True)
  430. # Copy sentence vectors model
  431. if self.config.get("storevectors"):
  432. shutil.copyfile(self.config["path"], os.path.join(path, os.path.basename(self.config["path"])))
  433. self.config["path"] = os.path.basename(self.config["path"])
  434. # Save index configuration
  435. self.saveconfig(path)
  436. # Save approximate nearest neighbor index
  437. if self.ann:
  438. self.ann.save(f"{path}/embeddings")
  439. # Save dimensionality reduction model (word vectors only)
  440. if self.reducer:
  441. self.reducer.save(f"{path}/lsa")
  442. # Save document database
  443. if self.database:
  444. self.database.save(f"{path}/documents")
  445. # Save scoring index
  446. if self.scoring:
  447. self.scoring.save(f"{path}/scoring")
  448. # Save subindexes
  449. if self.indexes:
  450. self.indexes.save(f"{path}/indexes")
  451. # Save graph
  452. if self.graph:
  453. self.graph.save(f"{path}/graph")
  454. # If this is an archive, save it
  455. if apath:
  456. self.archive.save(apath)
  457. # Save to cloud, if configured
  458. cloud = self.createcloud(cloud=cloud, **kwargs)
  459. if cloud:
  460. cloud.save(apath if apath else path)
  461. def close(self):
  462. """
  463. Closes this embeddings index and frees all resources.
  464. """
  465. self.ann, self.config, self.graph, self.archive = None, None, None, None
  466. self.reducer, self.query, self.model, self.models = None, None, None, None
  467. # Close database connection if open
  468. if self.database:
  469. self.database.close()
  470. self.database, self.functions = None, None
  471. # Close scoring instance if open
  472. if self.scoring:
  473. self.scoring.close()
  474. self.scoring = None
  475. # Close indexes if open
  476. if self.indexes:
  477. self.indexes.close()
  478. self.indexes = None
  479. def info(self):
  480. """
  481. Prints the current embeddings index configuration.
  482. """
  483. if self.config:
  484. # Copy and edit config
  485. config = self.config.copy()
  486. # Remove ids array if present
  487. config.pop("ids", None)
  488. # Print configuration
  489. print(json.dumps(config, sort_keys=True, default=str, indent=2))
  490. def issparse(self):
  491. """
  492. Checks if this instance has an associated scoring instance with term indexing enabled.
  493. Returns:
  494. True if term index is enabled, False otherwise
  495. """
  496. return self.scoring and self.scoring.hasterms()
  497. def isdense(self):
  498. """
  499. Checks if this instance has an associated ANN instance.
  500. Returns:
  501. True if this instance has an associated ANN, False otherwise
  502. """
  503. return self.ann is not None
  504. def isweighted(self):
  505. """
  506. Checks if this instance has an associated scoring instance with term weighting enabled.
  507. Returns:
  508. True if term weighting is enabled, False otherwise
  509. """
  510. return self.scoring and not self.scoring.hasterms()
  511. def configure(self, config):
  512. """
  513. Sets the configuration for this embeddings index and loads config-driven models.
  514. Args:
  515. config: embeddings configuration
  516. """
  517. # Configuration
  518. self.config = config
  519. # Dimensionality reduction model
  520. self.reducer = None
  521. # Create scoring instance for word vectors term weighting
  522. scoring = self.config.get("scoring") if self.config else None
  523. self.scoring = self.createscoring() if scoring and (not isinstance(scoring, dict) or not scoring.get("terms")) else None
  524. # Dense vectors - transforms data to embeddings vectors
  525. self.model = self.loadvectors() if self.config else None
  526. # Query model
  527. self.query = self.loadquery() if self.config else None
  528. def initindex(self, reindex):
  529. """
  530. Initialize new index.
  531. Args:
  532. reindex: if this is a reindex operation in which case database creation is skipped, defaults to False
  533. """
  534. # Initialize default parameters, if necessary
  535. self.defaults()
  536. # Create document database, if necessary
  537. if not reindex:
  538. self.database = self.createdatabase()
  539. # Reset archive since this is a new index
  540. self.archive = None
  541. # Initialize ANN, will be created after index transformations complete
  542. self.ann = None
  543. # Create scoring only if term indexing is enabled
  544. scoring = self.config.get("scoring")
  545. if scoring and isinstance(scoring, dict) and self.config["scoring"].get("terms"):
  546. self.scoring = self.createscoring()
  547. # Create subindexes, if necessary
  548. self.indexes = self.createindexes()
  549. # Create graph, if necessary
  550. self.graph = self.creategraph()
  551. def defaults(self):
  552. """
  553. Apply default parameters to current configuration.
  554. Returns:
  555. configuration with default parameters set
  556. """
  557. self.config = self.config if self.config else {}
  558. # Expand sparse index shortcuts
  559. if not self.config.get("scoring") and any(self.config.get(key) for key in ["keyword", "hybrid"]):
  560. self.config["scoring"] = {"method": "bm25", "terms": True, "normalize": True}
  561. # Check if default model should be loaded
  562. if not self.model and self.defaultallowed():
  563. self.config["path"] = "sentence-transformers/all-MiniLM-L6-v2"
  564. # Load dense vectors model
  565. self.model = self.loadvectors()
  566. def defaultallowed(self):
  567. """
  568. Tests if this embeddings instance can use a default model if not otherwise provided.
  569. Returns:
  570. True if a default model is allowed, False otherwise
  571. """
  572. params = [("keyword", False), ("defaults", True)]
  573. return all(self.config.get(key, default) == default for key, default in params)
  574. def loadconfig(self, path):
  575. """
  576. Loads index configuration. This method supports both config pickle files and config.json files.
  577. Args:
  578. path: path to directory
  579. Returns:
  580. dict
  581. """
  582. # Configuration
  583. config = None
  584. # Determine if config is json or pickle
  585. jsonconfig = os.path.exists(f"{path}/config.json")
  586. # Set config file name
  587. name = "config.json" if jsonconfig else "config"
  588. # Load configuration
  589. with open(f"{path}/{name}", "r" if jsonconfig else "rb") as handle:
  590. config = json.load(handle) if jsonconfig else pickle.load(handle)
  591. # Build full path to embedding vectors file
  592. if config.get("storevectors"):
  593. config["path"] = os.path.join(path, config["path"])
  594. return config
  595. def saveconfig(self, path):
  596. """
  597. Saves index configuration. This method saves to JSON if possible, otherwise it falls back to pickle.
  598. Args:
  599. path: path to directory
  600. Returns:
  601. dict
  602. """
  603. # Default to pickle config
  604. jsonconfig = self.config.get("format", "pickle") == "json"
  605. # Set config file name
  606. name = "config.json" if jsonconfig else "config"
  607. # Write configuration
  608. with open(f"{path}/{name}", "w" if jsonconfig else "wb", encoding="utf-8" if jsonconfig else None) as handle:
  609. if jsonconfig:
  610. # Write config as JSON
  611. json.dump(self.config, handle, default=str, indent=2)
  612. else:
  613. # Write config as pickle format
  614. pickle.dump(self.config, handle, protocol=__pickle__)
  615. def loadvectors(self):
  616. """
  617. Loads a vector model set in config.
  618. Returns:
  619. vector model
  620. """
  621. # Create model cache if subindexes are enabled
  622. if "indexes" in self.config and self.models is None:
  623. self.models = {}
  624. # Model path
  625. path = self.config.get("path")
  626. # Check if model is cached
  627. if self.models and path in self.models:
  628. return self.models[path]
  629. # Load and store uncached model
  630. model = VectorsFactory.create(self.config, self.scoring)
  631. if self.models is not None and path:
  632. self.models[path] = model
  633. return model
  634. def loadquery(self):
  635. """
  636. Loads a query model set in config.
  637. Returns:
  638. query model
  639. """
  640. if "query" in self.config:
  641. return Query(**self.config["query"])
  642. return None
  643. def checkarchive(self, path):
  644. """
  645. Checks if path is an archive file.
  646. Args:
  647. path: path to check
  648. Returns:
  649. (working directory, current path) if this is an archive, original path otherwise
  650. """
  651. # Create archive instance, if necessary
  652. self.archive = ArchiveFactory.create()
  653. # Check if path is an archive file
  654. if self.archive.isarchive(path):
  655. # Return temporary archive working directory and original path
  656. return self.archive.path(), path
  657. return path, None
  658. def createcloud(self, **cloud):
  659. """
  660. Creates a cloud instance from config.
  661. Args:
  662. cloud: cloud configuration
  663. """
  664. # Merge keyword args and keys under the cloud parameter
  665. config = cloud
  666. if "cloud" in config and config["cloud"]:
  667. config.update(config.pop("cloud"))
  668. # Create cloud instance from config and return
  669. return CloudFactory.create(config) if config else None
  670. def createann(self):
  671. """
  672. Creates an ANN from config.
  673. Returns:
  674. new ANN, if enabled in config
  675. """
  676. return ANNFactory.create(self.config) if self.config.get("path") or self.defaultallowed() else None
  677. def createdatabase(self):
  678. """
  679. Creates a database from config. This method will also close any existing database connection.
  680. Returns:
  681. new database, if enabled in config
  682. """
  683. # Free existing database resources
  684. if self.database:
  685. self.database.close()
  686. config = self.config.copy()
  687. # Create references to callable functions
  688. self.functions = Functions(self) if "functions" in config else None
  689. if self.functions:
  690. config["functions"] = self.functions(config)
  691. # Create database from config and return
  692. return DatabaseFactory.create(config)
  693. def creategraph(self):
  694. """
  695. Creates a graph from config.
  696. Returns:
  697. new graph, if enabled in config
  698. """
  699. if "graph" in self.config:
  700. # Get or create graph configuration
  701. config = self.config["graph"] if self.config["graph"] else {}
  702. # Create configuration with custom columns, if necessary
  703. config = self.columns(config)
  704. return GraphFactory.create(config)
  705. return None
  706. def createindexes(self):
  707. """
  708. Creates subindexes from config.
  709. Returns:
  710. list of subindexes
  711. """
  712. # Load subindexes
  713. if "indexes" in self.config:
  714. indexes = {}
  715. for index, config in self.config["indexes"].items():
  716. # Create index with shared model cache
  717. indexes[index] = Embeddings(config, models=self.models)
  718. # Wrap as Indexes object
  719. return Indexes(self, indexes)
  720. return None
  721. def createscoring(self):
  722. """
  723. Creates a scoring from config.
  724. Returns:
  725. new scoring, if enabled in config
  726. """
  727. # Free existing resources
  728. if self.scoring:
  729. self.scoring.close()
  730. if "scoring" in self.config:
  731. # Expand scoring to a dictionary, if necessary
  732. config = self.config["scoring"]
  733. config = config if isinstance(config, dict) else {"method": config}
  734. # Create configuration with custom columns, if necessary
  735. config = self.columns(config)
  736. return ScoringFactory.create(config)
  737. return None
  738. def columns(self, config):
  739. """
  740. Adds custom text/object column information if it's provided.
  741. Args:
  742. config: input configuration
  743. Returns:
  744. config with column information added
  745. """
  746. # Add text/object columns if custom
  747. if "columns" in self.config:
  748. # Work on copy of configuration
  749. config = config.copy()
  750. # Copy columns to config
  751. config["columns"] = self.config["columns"]
  752. return config
  753. def normalize(self, embeddings):
  754. """
  755. Normalizes embeddings using L2 normalization. Operation applied directly on array.
  756. Args:
  757. embeddings: input embeddings matrix
  758. """
  759. # Calculation is different for matrices vs vectors
  760. if len(embeddings.shape) > 1:
  761. embeddings /= np.linalg.norm(embeddings, axis=1)[:, np.newaxis]
  762. else:
  763. embeddings /= np.linalg.norm(embeddings)

__init__(self, config=None, models=None, **kwargs) special

Creates a new embeddings index. Embeddings indexes are thread-safe for read operations but writes must be synchronized.

Parameters:

| Name | Type | Description | Default |
|------|------|-------------|---------|
| config | | embeddings configuration | None |
| models | | models cache, used for model sharing between embeddings | None |
| kwargs | | additional configuration as keyword args | {} |

Source code in txtai/embeddings/base.py

  1. def __init__(self, config=None, models=None, **kwargs):
  2. """
  3. Creates a new embeddings index. Embeddings indexes are thread-safe for read operations but writes must be synchronized.
  4. Args:
  5. config: embeddings configuration
  6. models: models cache, used for model sharing between embeddings
  7. kwargs: additional configuration as keyword args
  8. """
  9. # Index configuration
  10. self.config = None
  11. # Dimensionality reduction - word vectors only
  12. self.reducer = None
  13. # Dense vector model - transforms data into similarity vectors
  14. self.model = None
  15. # Approximate nearest neighbor index
  16. self.ann = None
  17. # Document database
  18. self.database = None
  19. # Resolvable functions
  20. self.functions = None
  21. # Graph network
  22. self.graph = None
  23. # Sparse vectors
  24. self.scoring = None
  25. # Query model
  26. self.query = None
  27. # Index archive
  28. self.archive = None
  29. # Subindexes for this embeddings instance
  30. self.indexes = None
  31. # Models cache
  32. self.models = models
  33. # Merge configuration into single dictionary
  34. config = {**config, **kwargs} if config and kwargs else kwargs if kwargs else config
  35. # Set initial configuration
  36. self.configure(config)

batchexplain(self, queries, texts=None, limit=None)

Explains the importance of each input token in text for a list of queries. This method requires either content to be enabled or texts to be provided.

Parameters:

NameTypeDescriptionDefault
queries

input queries

required
texts

optional list of (text|list of tokens), otherwise runs search queries

None
limit

optional limit if texts is None

None

Returns:

TypeDescription

list of dict per input text per query, where higher token scores represent higher importance relative to the query

Source code in txtai/embeddings/base.py

  def batchexplain(self, queries, texts=None, limit=None):
      """
      Explains the importance of each input token in text for a list of queries. This method requires either
      content to be enabled or texts to be provided.

      Args:
          queries: input queries
          texts: optional list of (text|list of tokens), otherwise runs search queries
          limit: optional limit if texts is None

      Returns:
          list of dict per input text per query, where higher token scores represent higher importance relative to the query
      """

      # Delegate to an Explain action bound to this instance
      explainer = Explain(self)
      return explainer(queries, texts, limit)

batchsearch(self, queries, limit=None, weights=None, index=None)

Finds documents most similar to the input queries. This method will run either an index search or an index + database search depending on if a database is available.

Parameters:

NameTypeDescriptionDefault
queries

input queries

required
limit

maximum results

None
weights

hybrid score weights, if applicable

None
index

index name, if applicable

None

Returns:

TypeDescription

list of (id, score) per query for index search, list of dict per query for an index + database search

Source code in txtai/embeddings/base.py

  def batchsearch(self, queries, limit=None, weights=None, index=None):
      """
      Finds documents most similar to the input queries. This method will run either an index search
      or an index + database search depending on if a database is available.

      Args:
          queries: input queries
          limit: maximum results
          weights: hybrid score weights, if applicable
          index: index name, if applicable

      Returns:
          list of (id, score) per query for index search, list of dict per query for an index + database search
      """

      # Delegate to a Search action bound to this instance
      searcher = Search(self)
      return searcher(queries, limit, weights, index)

batchsimilarity(self, queries, data)

Computes the similarity between list of queries and list of data. Returns a list of (id, score) sorted by highest score per query, where id is the index in data.

Parameters:

NameTypeDescriptionDefault
queries

input queries

required
data

list of data

required

Returns:

TypeDescription

list of (id, score) per query

Source code in txtai/embeddings/base.py

  def batchsimilarity(self, queries, data):
      """
      Computes the similarity between list of queries and list of data. Returns a list
      of (id, score) sorted by highest score per query, where id is the index in data.

      Args:
          queries: input queries
          data: list of data

      Returns:
          list of (id, score) per query
      """

      # Convert both sides to embeddings vectors
      qvectors = self.batchtransform(((None, query, None) for query in queries), "query")
      dvectors = self.batchtransform(((None, row, None) for row in data), "data")

      # Dot product on normalized vectors is equal to cosine similarity
      matrix = np.dot(qvectors, dvectors.T).tolist()

      # Pair each score with its index in data and order by descending score
      results = []
      for row in matrix:
          ranked = list(enumerate(row))
          ranked.sort(key=lambda pair: pair[1], reverse=True)
          results.append(ranked)

      return results

batchterms(self, queries)

Extracts keyword terms from a list of queries.

Parameters:

NameTypeDescriptionDefault
queries

list of queries

required

Returns:

TypeDescription

list of queries reduced down to keyword term strings

Source code in txtai/embeddings/base.py

  def batchterms(self, queries):
      """
      Extracts keyword terms from a list of queries.

      Args:
          queries: list of queries

      Returns:
          list of queries reduced down to keyword term strings
      """

      # Delegate to a Terms action bound to this instance
      extractor = Terms(self)
      return extractor(queries)

batchtransform(self, documents, category=None)

Transforms documents into embeddings vectors.

Parameters:

NameTypeDescriptionDefault
documents

iterable of (id, data, tags), (id, data) or data

required
category

category for instruction-based embeddings

None

Returns:

TypeDescription

embeddings vectors

Source code in txtai/embeddings/base.py

  def batchtransform(self, documents, category=None):
      """
      Transforms documents into embeddings vectors.

      Args:
          documents: iterable of (id, data, tags), (id, data) or data
          category: category for instruction-based embeddings

      Returns:
          embeddings vectors
      """

      # Initialize default parameters, if necessary
      self.defaults()

      # Convert documents into sentence embeddings
      vectors = self.model.batchtransform(Stream(self)(documents), category)

      # Reduce the dimensionality of the embeddings. Scale the embeddings using this
      # model to reduce the noise of common but less relevant terms.
      reducer = self.reducer
      if reducer:
          reducer(vectors)

      # Normalize embeddings (applied directly on the array)
      self.normalize(vectors)

      return vectors

checkarchive(self, path)

Checks if path is an archive file.

Parameters:

NameTypeDescriptionDefault
path

path to check

required

Returns:

TypeDescription

(working directory, original path) if this is an archive, (original path, None) otherwise

Source code in txtai/embeddings/base.py

  def checkarchive(self, path):
      """
      Checks if path is an archive file.

      Args:
          path: path to check

      Returns:
          (working directory, original path) if this is an archive, (original path, None) otherwise
      """

      # A new archive instance is created on every call and stored on this instance
      self.archive = ArchiveFactory.create()

      # Not an archive, use the path as-is
      if not self.archive.isarchive(path):
          return path, None

      # Return temporary archive working directory and original path
      return self.archive.path(), path

close(self)

Closes this embeddings index and frees all resources.

Source code in txtai/embeddings/base.py

  def close(self):
      """
      Closes this embeddings index and frees all resources.
      """

      # Drop references to components that have no close method called here
      for name in ("ann", "config", "graph", "archive", "reducer", "query", "model", "models"):
          setattr(self, name, None)

      # Close database connection if open, function references are only cleared with it
      if self.database:
          self.database.close()
          self.database = self.functions = None

      # Close scoring instance if open
      if self.scoring:
          self.scoring.close()
          self.scoring = None

      # Close indexes if open
      if self.indexes:
          self.indexes.close()
          self.indexes = None

columns(self, config)

Adds custom text/object column information if it’s provided.

Parameters:

NameTypeDescriptionDefault
config

input configuration

required

Returns:

TypeDescription

config with column information added

Source code in txtai/embeddings/base.py

  def columns(self, config):
      """
      Adds custom text/object column information if it's provided.

      Args:
          config: input configuration

      Returns:
          config with column information added
      """

      # No custom columns, return the input configuration unchanged
      if "columns" not in self.config:
          return config

      # Work on a copy so the input configuration is not modified
      merged = config.copy()
      merged["columns"] = self.config["columns"]

      return merged

configure(self, config)

Sets the configuration for this embeddings index and loads config-driven models.

Parameters:

NameTypeDescriptionDefault
config

embeddings configuration

required

Source code in txtai/embeddings/base.py

  def configure(self, config):
      """
      Sets the configuration for this embeddings index and loads config-driven models.

      Args:
          config: embeddings configuration
      """

      # Configuration
      self.config = config

      # Dimensionality reduction model
      self.reducer = None

      # Create scoring instance for word vectors term weighting.
      # Skipped when scoring is a dictionary with term indexing enabled.
      settings = self.config.get("scoring") if self.config else None
      if settings and not (isinstance(settings, dict) and settings.get("terms")):
          self.scoring = self.createscoring()
      else:
          self.scoring = None

      # Nothing else to load without a configuration
      if not self.config:
          self.model = None
          self.query = None
          return

      # Dense vectors - transforms data to embeddings vectors
      self.model = self.loadvectors()

      # Query model
      self.query = self.loadquery()

count(self)

Total number of elements in this embeddings index.

Returns:

TypeDescription

number of elements in this embeddings index

Source code in txtai/embeddings/base.py

  def count(self):
      """
      Total number of elements in this embeddings index.

      The count is read from the first available component in this order: ANN index, scoring index,
      document database, then the ids list stored in the configuration.

      Returns:
          number of elements in this embeddings index
      """

      # Use the first component that is set
      for component in (self.ann, self.scoring, self.database):
          if component:
              return component.count()

      # Fall back to the ids stored in config, deleted ids are stored as None.
      # The configuration itself is None for an unconfigured instance.
      ids = self.config.get("ids") if self.config else None
      if ids:
          return len([uid for uid in ids if uid is not None])

      # Default to 0 when no suitable method found
      return 0

createann(self)

Creates an ANN from config.

Returns:

TypeDescription

new ANN, if enabled in config

Source code in txtai/embeddings/base.py

  def createann(self):
      """
      Creates an ANN from config.

      Returns:
          new ANN, if enabled in config
      """

      # An ANN requires either a configured vector model path or permission to use the default model
      enabled = self.config.get("path") or self.defaultallowed()
      if not enabled:
          return None

      return ANNFactory.create(self.config)

createcloud(self, **cloud)

Creates a cloud instance from config.

Parameters:

NameTypeDescriptionDefault
cloud

cloud configuration

{}

Source code in txtai/embeddings/base.py

  def createcloud(self, **cloud):
      """
      Creates a cloud instance from config.

      Args:
          cloud: cloud configuration

      Returns:
          cloud instance created by CloudFactory, None when no configuration is provided
      """

      # Merge keyword args and keys under the cloud parameter
      settings = cloud
      if settings.get("cloud"):
          settings.update(settings.pop("cloud"))

      # No configuration provided
      if not settings:
          return None

      # Create cloud instance from config and return
      return CloudFactory.create(settings)

createdatabase(self)

Creates a database from config. This method will also close any existing database connection.

Returns:

TypeDescription

new database, if enabled in config

Source code in txtai/embeddings/base.py

  def createdatabase(self):
      """
      Creates a database from config. This method will also close any existing database connection.

      Returns:
          new database, if enabled in config
      """

      # Free existing database resources
      if self.database:
          self.database.close()

      # Work on a copy so resolved functions are not written back to the index configuration
      settings = self.config.copy()

      # Create references to callable functions
      if "functions" in settings:
          self.functions = Functions(self)
      else:
          self.functions = None

      if self.functions:
          settings["functions"] = self.functions(settings)

      # Create database from config and return
      return DatabaseFactory.create(settings)

creategraph(self)

Creates a graph from config.

Returns:

TypeDescription

new graph, if enabled in config

Source code in txtai/embeddings/base.py

  def creategraph(self):
      """
      Creates a graph from config.

      Returns:
          new graph, if enabled in config
      """

      # Graph is only created when the graph key is present
      if "graph" not in self.config:
          return None

      # Get or create graph configuration
      settings = self.config["graph"] or {}

      # Create configuration with custom columns, if necessary
      settings = self.columns(settings)

      return GraphFactory.create(settings)

createindexes(self)

Creates subindexes from config.

Returns:

TypeDescription

Indexes instance wrapping the configured subindexes, None if no subindexes are configured

Source code in txtai/embeddings/base.py

  def createindexes(self):
      """
      Creates subindexes from config.

      Returns:
          Indexes instance wrapping the configured subindexes, None if no subindexes are configured
      """

      # Subindexes are only created when the indexes key is present
      if "indexes" not in self.config:
          return None

      # Create each subindex with the shared models cache
      subindexes = {name: Embeddings(settings, models=self.models) for name, settings in self.config["indexes"].items()}

      # Wrap as Indexes object
      return Indexes(self, subindexes)

createscoring(self)

Creates a scoring from config.

Returns:

TypeDescription

new scoring, if enabled in config

Source code in txtai/embeddings/base.py

  def createscoring(self):
      """
      Creates a scoring from config. This method also closes any existing scoring instance.

      Returns:
          new scoring, if enabled in config
      """

      # Free existing resources
      if self.scoring:
          self.scoring.close()

      # Scoring is only created when the scoring key is present
      if "scoring" not in self.config:
          return None

      # Expand scoring to a dictionary, if necessary
      settings = self.config["scoring"]
      if not isinstance(settings, dict):
          settings = {"method": settings}

      # Create configuration with custom columns, if necessary
      settings = self.columns(settings)

      return ScoringFactory.create(settings)

defaultallowed(self)

Tests if this embeddings instance can use a default model if not otherwise provided.

Returns:

TypeDescription

True if a default model is allowed, False otherwise

Source code in txtai/embeddings/base.py

  def defaultallowed(self):
      """
      Tests if this embeddings instance can use a default model if not otherwise provided.

      Returns:
          True if a default model is allowed, False otherwise
      """

      # A default model is allowed only when each setting is unset or equal to the value listed here
      required = {"keyword": False, "defaults": True}
      for key, value in required.items():
          if self.config.get(key, value) != value:
              return False

      return True

defaults(self)

Apply default parameters to current configuration.

Returns:

TypeDescription

None; default parameters are applied in place to the current configuration

Source code in txtai/embeddings/base.py

  def defaults(self):
      """
      Apply default parameters to current configuration. The configuration is updated in place,
      this method has no return value.
      """

      # Start from an empty configuration when none is set
      if not self.config:
          self.config = {}

      # Expand sparse index shortcuts
      sparse = any(self.config.get(key) for key in ["keyword", "hybrid"])
      if sparse and not self.config.get("scoring"):
          self.config["scoring"] = {"method": "bm25", "terms": True, "normalize": True}

      # Nothing to load when a model is already set or a default model is not allowed
      if self.model or not self.defaultallowed():
          return

      # Set the default model path and load the dense vectors model
      self.config["path"] = "sentence-transformers/all-MiniLM-L6-v2"
      self.model = self.loadvectors()

delete(self, ids)

Deletes from an embeddings index. Returns list of ids deleted.

Parameters:

NameTypeDescriptionDefault
ids

list of ids to delete

required

Returns:

TypeDescription

list of ids deleted

Source code in txtai/embeddings/base.py

  def delete(self, ids):
      """
      Deletes from an embeddings index. Returns list of ids deleted.

      Args:
          ids: list of ids to delete

      Returns:
          list of ids deleted
      """

      # Internal index positions to delete and the ids that were deleted
      positions, removed = [], []

      if self.database:
          # Retrieve indexid-id mappings from database
          mappings = self.database.ids(ids)

          # Parse out positions and unique ids to delete
          positions = [position for position, _ in mappings]
          removed = sorted({uid for _, uid in mappings})

          # Delete ids from database
          self.database.delete(removed)
      elif self.ann or self.scoring:
          # Lookup indexids from config for indexes with no database
          lookup = self.config["ids"]

          # Find every position holding one of the requested ids
          for uid in ids:
              positions.extend(position for position, value in enumerate(lookup) if uid == value)

          # Clear config ids, a deleted slot is stored as None
          for position in positions:
              removed.append(lookup[position])
              lookup[position] = None

      # Delete positions from all indexes and data stores
      if positions:
          # Delete ids from ann
          if self.isdense():
              self.ann.delete(positions)

          # Delete ids from scoring
          if self.issparse():
              self.scoring.delete(positions)

          # Delete ids from subindexes
          if self.indexes:
              self.indexes.delete(positions)

          # Delete ids from graph
          if self.graph:
              self.graph.delete(positions)

      return removed

exists(self, path=None, cloud=None, **kwargs)

Checks if an index exists at path.

Parameters:

NameTypeDescriptionDefault
path

input path

None
cloud

cloud storage configuration

None
kwargs

additional configuration as keyword args

{}

Returns:

TypeDescription

True if index exists, False otherwise

Source code in txtai/embeddings/base.py

  def exists(self, path=None, cloud=None, **kwargs):
      """
      Checks if an index exists at path.

      Args:
          path: input path
          cloud: cloud storage configuration
          kwargs: additional configuration as keyword args

      Returns:
          True if index exists, False otherwise
      """

      # Check if this exists in a cloud instance
      cloud = self.createcloud(cloud=cloud, **kwargs)
      if cloud:
          return cloud.exists(path)

      # Check if this is an archive file and exists
      path, apath = self.checkarchive(path)
      if apath:
          return os.path.exists(apath)

      # An empty or missing path cannot hold an index. Return a real boolean instead of the falsy path.
      if not path:
          return False

      # Index directories have a config or config.json file and an embeddings (dense) or scoring (sparse) file
      hasconfig = os.path.exists(f"{path}/config") or os.path.exists(f"{path}/config.json")
      hasindex = os.path.exists(f"{path}/embeddings") or os.path.exists(f"{path}/scoring")

      return hasconfig and hasindex

explain(self, query, texts=None, limit=None)

Explains the importance of each input token in text for a query. This method requires either content to be enabled or texts to be provided.

Parameters:

NameTypeDescriptionDefault
query

input query

required
texts

optional list of (text|list of tokens), otherwise runs search query

None
limit

optional limit if texts is None

None

Returns:

TypeDescription

list of dict per input text, where higher token scores represent higher importance relative to the query

Source code in txtai/embeddings/base.py

  def explain(self, query, texts=None, limit=None):
      """
      Explains the importance of each input token in text for a query. This method requires either
      content to be enabled or texts to be provided.

      Args:
          query: input query
          texts: optional list of (text|list of tokens), otherwise runs search query
          limit: optional limit if texts is None

      Returns:
          list of dict per input text, where higher token scores represent higher importance relative to the query
      """

      # Run as a single query batch
      results = self.batchexplain([query], texts, limit)

      # Empty results are passed through unchanged
      if not results:
          return results

      return results[0]

index(self, documents, reindex=False)

Builds an embeddings index. This method overwrites an existing index.

Parameters:

NameTypeDescriptionDefault
documents

iterable of (id, data, tags), (id, data) or data

required
reindex

if this is a reindex operation in which case database creation is skipped, defaults to False

False

Source code in txtai/embeddings/base.py

  def index(self, documents, reindex=False):
      """
      Builds an embeddings index. This method overwrites an existing index.

      Args:
          documents: iterable of (id, data, tags), (id, data) or data
          reindex: if this is a reindex operation in which case database creation is skipped, defaults to False
      """

      # Initialize index
      self.initindex(reindex)

      # Create transform and stream for the requested action
      action = Action.REINDEX if reindex else Action.INDEX
      transform = Transform(self, action)
      stream = Stream(self, action)

      with tempfile.NamedTemporaryFile(mode="wb", suffix=".npy") as buffer:
          # Load documents into database and transform to vectors
          ids, dimensions, embeddings = transform(stream(documents), buffer)

          if embeddings is not None:
              # Build LSA model (if enabled). Remove principal components from embeddings.
              pca = self.config.get("pca")
              if pca:
                  self.reducer = Reducer(embeddings, pca)
                  self.reducer(embeddings)

              # Normalize embeddings
              self.normalize(embeddings)

              # Save index dimensions
              self.config["dimensions"] = dimensions

              # Create approximate nearest neighbor index and add embeddings to it
              self.ann = self.createann()
              self.ann.index(embeddings)

          # Save indexids-ids mapping for indexes with no database, except when this is a reindex
          if ids and not reindex and not self.database:
              self.config["ids"] = ids

      # Index scoring, if necessary
      # This must occur before graph index in order to be available to the graph
      if self.issparse():
          self.scoring.index()

      # Index subindexes, if necessary
      if self.indexes:
          self.indexes.index()

      # Index graph, if necessary
      if self.graph:
          self.graph.index(Search(self, True), self.batchsimilarity)

info(self)

Prints the current embeddings index configuration.

Source code in txtai/embeddings/base.py

  def info(self):
      """
      Prints the current embeddings index configuration.
      """

      # Nothing to print without a configuration
      if not self.config:
          return

      # Copy config and remove the ids array, if present, the stored configuration is not modified
      summary = self.config.copy()
      summary.pop("ids", None)

      # Print configuration as sorted JSON, values that are not JSON serializable are printed as strings
      print(json.dumps(summary, sort_keys=True, default=str, indent=2))

initindex(self, reindex)

Initialize new index.

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| reindex | | True if this is a reindex operation, in which case database creation is skipped | required |

Source code in txtai/embeddings/base.py

  def initindex(self, reindex):
      """
      Initialize new index.

      Args:
          reindex: True if this is a reindex operation, in which case database creation is skipped
      """

      # Initialize default parameters, if necessary
      self.defaults()

      if not reindex:
          # Create document database, if necessary
          self.database = self.createdatabase()

          # Reset archive since this is a new index
          self.archive = None

      # Initialize ANN, will be created after index transformations complete
      self.ann = None

      # Create scoring only if term indexing is enabled
      settings = self.config.get("scoring")
      terms = settings and isinstance(settings, dict) and self.config["scoring"].get("terms")
      if terms:
          self.scoring = self.createscoring()

      # Create subindexes, if necessary
      self.indexes = self.createindexes()

      # Create graph, if necessary
      self.graph = self.creategraph()

isdense(self)

Checks if this instance has an associated ANN instance.

Returns:

TypeDescription

True if this instance has an associated ANN, False otherwise

Source code in txtai/embeddings/base.py

  def isdense(self):
      """
      Checks if this instance has an associated ANN instance.

      Returns:
          True if this instance has an associated ANN, False otherwise
      """

      # Identity check, any ANN object counts even one that would evaluate as falsy
      hasann = self.ann is not None
      return hasann

issparse(self)

Checks if this instance has an associated scoring instance with term indexing enabled.

Returns:

TypeDescription

True if term index is enabled, False otherwise

Source code in txtai/embeddings/base.py

  def issparse(self):
      """
      Checks if this instance has an associated scoring instance with term indexing enabled.

      Returns:
          True if term index is enabled, False otherwise. When no scoring instance is set,
          the falsy scoring value itself is returned.
      """

      scoring = self.scoring

      # No scoring instance
      if not scoring:
          return scoring

      return scoring.hasterms()

isweighted(self)

Checks if this instance has an associated scoring instance with term weighting enabled.

Returns:

TypeDescription

True if term weighting is enabled, False otherwise

Source code in txtai/embeddings/base.py

  def isweighted(self):
      """
      Checks if this instance has an associated scoring instance with term weighting enabled.

      Returns:
          True if term weighting is enabled, False otherwise. When no scoring instance is set,
          the falsy scoring value itself is returned.
      """

      scoring = self.scoring

      # No scoring instance
      if not scoring:
          return scoring

      # Term weighting applies when the scoring instance has no term index
      return not scoring.hasterms()

load(self, path=None, cloud=None, **kwargs)

Loads an existing index from path.

Parameters:

NameTypeDescriptionDefault
path

input path

None
cloud

cloud storage configuration

None
kwargs

additional configuration as keyword args

{}

Source code in txtai/embeddings/base.py

  def load(self, path=None, cloud=None, **kwargs):
      """
      Loads an existing index from path.

      Args:
          path: input path
          cloud: cloud storage configuration
          kwargs: additional configuration as keyword args
      """

      # Load from cloud, if configured
      provider = self.createcloud(cloud=cloud, **kwargs)
      if provider:
          path = provider.load(path)

      # Check if this is an archive file and extract
      path, archivepath = self.checkarchive(path)
      if archivepath:
          self.archive.load(archivepath)

      # Load index configuration
      self.config = self.loadconfig(path)

      # Approximate nearest neighbor index - stores dense vectors
      self.ann = self.createann()
      if self.ann:
          self.ann.load(f"{path}/embeddings")

      # Dimensionality reduction model - word vectors only
      if self.config.get("pca"):
          self.reducer = Reducer()
          self.reducer.load(f"{path}/lsa")

      # Remaining components are created from config and loaded when enabled, in this order:
      # document database, sparse vectors (scoring), subindexes and graph network
      components = (
          ("database", self.createdatabase, "documents"),
          ("scoring", self.createscoring, "scoring"),
          ("indexes", self.createindexes, "indexes"),
          ("graph", self.creategraph, "graph"),
      )
      for attribute, create, name in components:
          component = create()
          setattr(self, attribute, component)
          if component:
              component.load(f"{path}/{name}")

      # Dense vectors - transforms data to embeddings vectors
      self.model = self.loadvectors()

      # Query model
      self.query = self.loadquery()

loadconfig(self, path)

Loads index configuration. This method supports both config pickle files and config.json files.

Parameters:

NameTypeDescriptionDefault
path

path to directory

required

Returns:

TypeDescription

dict

Source code in txtai/embeddings/base.py

  def loadconfig(self, path):
      """
      Loads index configuration. This method supports both config pickle files and config.json files.
      A config.json file takes precedence when present.

      Args:
          path: path to directory

      Returns:
          dict
      """

      # Determine if config is json or pickle
      jsonconfig = os.path.exists(f"{path}/config.json")

      # Set config file name
      name = "config.json" if jsonconfig else "config"

      # Load configuration. JSON is read as UTF-8 to match how saveconfig writes it.
      # NOTE: pickle.load can execute arbitrary code, only load pickle configurations from trusted sources.
      with open(f"{path}/{name}", "r" if jsonconfig else "rb", encoding="utf-8" if jsonconfig else None) as handle:
          config = json.load(handle) if jsonconfig else pickle.load(handle)

      # Build full path to embedding vectors file
      if config.get("storevectors"):
          config["path"] = os.path.join(path, config["path"])

      return config

loadquery(self)

Loads a query model set in config.

Returns:

TypeDescription

query model

Source code in txtai/embeddings/base.py

  def loadquery(self):
      """
      Loads a query model set in config.

      Returns:
          query model, None if no query model is configured
      """

      # Query model is only created when the query key is present
      if "query" not in self.config:
          return None

      # Query settings are passed as keyword arguments
      return Query(**self.config["query"])

loadvectors(self)

Loads a vector model set in config.

Returns:

TypeDescription

vector model

Source code in txtai/embeddings/base.py

  def loadvectors(self):
      """
      Loads a vector model set in config. Models are read from and stored in the models cache
      when a cache is available.

      Returns:
          vector model
      """

      # Create model cache if subindexes are enabled
      if self.models is None and "indexes" in self.config:
          self.models = {}

      # Model path
      path = self.config.get("path")

      # Return the cached model, if available
      cache = self.models
      if cache and path in cache:
          return cache[path]

      # Load uncached model
      model = VectorsFactory.create(self.config, self.scoring)

      # Store in the cache, only when a cache exists and the model has a path
      if cache is not None and path:
          cache[path] = model

      return model

normalize(self, embeddings)

Normalizes embeddings using L2 normalization. Operation applied directly on array.

Parameters:

NameTypeDescriptionDefault
embeddings

input embeddings matrix

required

Source code in txtai/embeddings/base.py

  def normalize(self, embeddings):
      """
      Normalizes embeddings using L2 normalization. Operation applied directly on array.

      Rows (or a vector) with a zero norm are left unchanged instead of being divided by zero,
      which would fill them with NaN values.

      Args:
          embeddings: input embeddings matrix
      """

      # Calculation is different for matrices vs vectors
      if len(embeddings.shape) > 1:
          norms = np.linalg.norm(embeddings, axis=1)[:, np.newaxis]

          # Dividing by 1 leaves all-zero rows as zeros
          norms[norms == 0] = 1
          embeddings /= norms
      else:
          norm = np.linalg.norm(embeddings)

          # Skip the all-zero vector, there is nothing to scale
          if norm != 0:
              embeddings /= norm

reindex(self, config=None, function=None, **kwargs)

Recreates embeddings index using config. This method only works if document content storage is enabled.

Parameters:

NameTypeDescriptionDefault
config

new config

None
function

optional function to prepare content for indexing

None
kwargs

additional configuration as keyword args

{}

Source code in txtai/embeddings/base.py

  def reindex(self, config=None, function=None, **kwargs):
      """
      Recreates embeddings index using config. This method only works if document content storage is enabled.
      Nothing happens when this instance has no database.

      Args:
          config: new config, this dictionary is not modified
          function: optional function to prepare content for indexing
          kwargs: additional configuration as keyword args
      """

      # Reindexing reads documents back from the database
      if not self.database:
          return

      # Merge configuration into a new dictionary, keyword arguments override config keys.
      # Building a new dictionary keeps the parameters added below out of the caller's config.
      config = {**(config if config else {}), **kwargs}

      # Keep content and objects parameters to ensure database is preserved
      config["content"] = self.config["content"]
      if "objects" in self.config:
          config["objects"] = self.config["objects"]

      # Reset configuration
      self.configure(config)

      # Reset function references
      if self.functions:
          self.functions.reset()

      # Read stored documents and optionally prepare them for indexing
      documents = self.database.reindex(self.config)
      if function:
          documents = function(documents)

      # Reindex
      self.index(documents, True)

save(self, path, cloud=None, **kwargs)

Saves an index in a directory at path unless path ends with tar.gz, tar.bz2, tar.xz or zip. In those cases, the index is stored as a compressed file.

Parameters:

NameTypeDescriptionDefault
path

output path

required
cloud

cloud storage configuration

None
kwargs

additional configuration as keyword args

{}

Source code in txtai/embeddings/base.py

  def save(self, path, cloud=None, **kwargs):
      """
      Saves an index in a directory at path unless path ends with tar.gz, tar.bz2, tar.xz or zip.
      In those cases, the index is stored as a compressed file.

      Args:
          path: output path
          cloud: cloud storage configuration
          kwargs: additional configuration as keyword args
      """

      # Nothing to save without a configuration
      if not self.config:
          return

      # Check if this is an archive file
      directory, archivepath = self.checkarchive(path)

      # Create output directory, if necessary
      os.makedirs(directory, exist_ok=True)

      # Copy sentence vectors model, the configured path is replaced with the file name
      if self.config.get("storevectors"):
          source = self.config["path"]
          filename = os.path.basename(source)
          shutil.copyfile(source, os.path.join(directory, filename))
          self.config["path"] = filename

      # Save index configuration
      self.saveconfig(directory)

      # Save each component that is set: approximate nearest neighbor index, dimensionality reduction
      # model (word vectors only), document database, scoring index, subindexes and graph
      components = (
          (self.ann, "embeddings"),
          (self.reducer, "lsa"),
          (self.database, "documents"),
          (self.scoring, "scoring"),
          (self.indexes, "indexes"),
          (self.graph, "graph"),
      )
      for component, name in components:
          if component:
              component.save(f"{directory}/{name}")

      # If this is an archive, save it
      if archivepath:
          self.archive.save(archivepath)

      # Save to cloud, if configured
      provider = self.createcloud(cloud=cloud, **kwargs)
      if provider:
          provider.save(archivepath if archivepath else directory)

saveconfig(self, path)

Saves index configuration. This method saves to JSON if possible, otherwise it falls back to pickle.

Parameters:

| Name | Type | Description | Default |
|------|------|-------------|---------|
| path | | path to directory | required |

Returns:

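| Type | Description |
|------|-------------|
| dict | return type as stated in the docstring; note that the source listing below contains no return statement |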

Source code in txtai/embeddings/base.py

  def saveconfig(self, path):
      """
      Writes the index configuration into the directory at path. The configuration is stored as
      JSON in config.json when the "format" setting is "json". Otherwise, it is pickled to a file
      named config.

      This method has no return statement, the configuration is only written to disk.

      Args:
          path: path to directory
      """

      # Pickle is the default when no format is configured
      if self.config.get("format", "pickle") == "json":
          # Text mode, values that are not JSON serializable are written via str()
          with open(f"{path}/config.json", "w", encoding="utf-8") as handle:
              json.dump(self.config, handle, default=str, indent=2)
      else:
          # Binary mode, pickle protocol comes from the module-level __pickle__ setting
          with open(f"{path}/config", "wb") as handle:
              pickle.dump(self.config, handle, protocol=__pickle__)

score(self, documents)

Builds a term weighting scoring index. Only used by word vectors models.

Parameters:

| Name | Type | Description | Default |
|------|------|-------------|---------|
| documents | | iterable of (id, data, tags), (id, data) or data | required |

Source code in txtai/embeddings/base.py

  def score(self, documents):
      """
      Builds the term weighting scoring index from documents. This only runs when
      isweighted() reports that term weighting is enabled (word vectors models).

      Args:
          documents: iterable of (id, data, tags), (id, data) or data
      """

      # Term weighting not enabled, nothing to build
      if not self.isweighted():
          return

      # Resolve the scoring indexer first, then feed it the streamed documents
      indexer = self.scoring.index
      indexer(Stream(self)(documents))

search(self, query, limit=None, weights=None, index=None)

Finds documents most similar to the input query. This method will run either an index search or an index + database search depending on if a database is available.

Parameters:

| Name | Type | Description | Default |
|------|------|-------------|---------|
| query | | input query | required |
| limit | | maximum results | None |
| weights | | hybrid score weights, if applicable | None |
| index | | index name, if applicable | None |

Returns:

| Type | Description |
|------|-------------|
| | list of (id, score) for index search, list of dict for an index + database search |

Source code in txtai/embeddings/base.py

  def search(self, query, limit=None, weights=None, index=None):
      """
      Finds documents most similar to the input query. This is the single query form of
      batchsearch, which runs either an index search or an index + database search
      depending on if a database is available.

      Args:
          query: input query
          limit: maximum results
          weights: hybrid score weights, if applicable
          index: index name, if applicable

      Returns:
          list of (id, score) for index search, list of dict for an index + database search
      """

      # Run as a batch of one
      batches = self.batchsearch([query], limit, weights, index)

      # Pass an empty batch result through unchanged
      if not batches:
          return batches

      # Otherwise, unwrap the results for the only query
      return batches[0]

similarity(self, query, data)

Computes the similarity between query and list of data. Returns a list of (id, score) sorted by highest score, where id is the index in data.

Parameters:

| Name | Type | Description | Default |
|------|------|-------------|---------|
| query | | input query | required |
| data | | list of data | required |

Returns:

| Type | Description |
|------|-------------|
| | list of (id, score) |

Source code in txtai/embeddings/base.py

  def similarity(self, query, data):
      """
      Computes the similarity between query and list of data. This is the single query
      form of batchsimilarity. Returns a list of (id, score) sorted by highest score,
      where id is the index in data.

      Args:
          query: input query
          data: list of data

      Returns:
          list of (id, score)
      """

      # Run as a batch of one and unwrap the scores for the only query
      scores = self.batchsimilarity([query], data)
      return scores[0]

terms(self, query)

Extracts keyword terms from a query.

Parameters:

| Name | Type | Description | Default |
|------|------|-------------|---------|
| query | | input query | required |

Returns:

| Type | Description |
|------|-------------|
| | query reduced down to keyword terms |

Source code in txtai/embeddings/base.py

  def terms(self, query):
      """
      Extracts keyword terms from a query. This is the single query form of batchterms.

      Args:
          query: input query

      Returns:
          query reduced down to keyword terms
      """

      # Run as a batch of one and unwrap the terms for the only query
      extracted = self.batchterms([query])
      return extracted[0]

transform(self, document)

Transforms document into an embeddings vector.

Parameters:

| Name | Type | Description | Default |
|------|------|-------------|---------|
| document | | (id, data, tags), (id, data) or data | required |

Returns:

| Type | Description |
|------|-------------|
| | embeddings vector |

Source code in txtai/embeddings/base.py

  def transform(self, document):
      """
      Transforms a single document into an embeddings vector. This is the single document
      form of batchtransform.

      Args:
          document: (id, data, tags), (id, data) or data

      Returns:
          embeddings vector
      """

      # Run as a batch of one and unwrap the vector for the only document
      vectors = self.batchtransform([document])
      return vectors[0]

upsert(self, documents)

Runs an embeddings upsert operation. If the index exists, new data is appended to the index, existing data is updated. If the index doesn’t exist, this method runs a standard index operation.

Parameters:

| Name | Type | Description | Default |
|------|------|-------------|---------|
| documents | | iterable of (id, data, tags), (id, data) or data | required |

Source code in txtai/embeddings/base.py

  def upsert(self, documents):
      """
      Runs an embeddings upsert operation. If the index exists, new data is
      appended to the index, existing data is updated. If the index doesn't exist,
      this method runs a standard index operation.

      Steps run in a fixed order: vectors are appended to the approximate nearest neighbor
      index, then the scoring, subindexes and graph components are upserted.

      Args:
          documents: iterable of (id, data, tags), (id, data) or data
      """

      # NOTE(review): block nesting below is reconstructed from a flattened listing - confirm against the source file

      # Run standard insert if index doesn't exist or it has no records
      if not self.count():
          self.index(documents)
          return

      # Create transform and stream, both configured for the upsert action
      transform = Transform(self, Action.UPSERT)
      stream = Stream(self, Action.UPSERT)

      # Temporary .npy file handed to transform as a buffer, removed when the with block exits
      with tempfile.NamedTemporaryFile(mode="wb", suffix=".npy") as buffer:
          # Load documents into database and transform to vectors
          # The second element of the returned tuple is not used here
          ids, _, embeddings = transform(stream(documents), buffer)
          if embeddings is not None:
              # Remove principal components from embeddings, if necessary
              # Return value is not used, presumably embeddings is modified in place - TODO confirm
              if self.reducer:
                  self.reducer(embeddings)

              # Normalize embeddings
              # Return value is not used, presumably embeddings is modified in place - TODO confirm
              self.normalize(embeddings)

              # Append embeddings to the index
              self.ann.append(embeddings)

          # Save indexids-ids mapping for indexes with no database
          # A new list is assigned, the existing ids list is not extended in place
          if ids and not self.database:
              self.config["ids"] = self.config["ids"] + ids

      # Scoring upsert, if necessary
      # This must occur before graph upsert in order to be available to the graph
      if self.issparse():
          self.scoring.upsert()

      # Subindexes upsert, if necessary
      if self.indexes:
          self.indexes.upsert()

      # Graph upsert, if necessary
      # The graph receives a Search instance and the batchsimilarity method
      if self.graph:
          self.graph.upsert(Search(self, True), self.batchsimilarity)