Skip to content

Commit

Permalink
[HTML search] Add preparatory test coverage before anticipated partia…
Browse files Browse the repository at this point in the history
…l-search refactoring.
  • Loading branch information
jayaddison committed Jul 16, 2024
1 parent 8b4709d commit 70788b6
Show file tree
Hide file tree
Showing 9 changed files with 285 additions and 10 deletions.
67 changes: 65 additions & 2 deletions sphinx/search/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,9 +7,10 @@
import json
import pickle
import re
from collections import defaultdict
from importlib import import_module
from os import path
from typing import IO, TYPE_CHECKING, Any
from typing import IO, TYPE_CHECKING, Any, Union

from docutils import nodes
from docutils.nodes import Element, Node
Expand All @@ -21,6 +22,8 @@
if TYPE_CHECKING:
from collections.abc import Iterable

from typing_extensions import TypeAlias


class SearchLanguage:
"""
Expand Down Expand Up @@ -250,6 +253,9 @@ class IndexBuilder:
'pickle': pickle
}

# Key under which a trie node stores its contents (the terms containing the
# ngram that leads to that node).  Path keys are built from ngram characters
# and are therefore never empty, so the empty string cannot clash with them.
_TRIE_CONTENTS = "" # serializable sentinel value
# Recursive node type: each key maps either to a nested node or to a list of
# contents stored under ``_TRIE_CONTENTS``.
_TrieNode: TypeAlias = dict[str, Union["_TrieNode", list]]

def __init__(self, env: BuildEnvironment, lang: str, options: dict[str, str], scoring: str) -> None:
self.env = env
# docname -> title
Expand Down Expand Up @@ -387,12 +393,68 @@ def get_terms(self, fn2index: dict[str, int]) -> tuple[dict[str, list[int] | int
rv[k] = sorted(fn2index[fn] for fn in v if fn in fn2index)
return rvs

@staticmethod
def _terms_ngrams(terms: dict[str, Any]) -> dict[str, list[str]]:
"""Extract unique ngrams (currently, trigrams) from the input search terms."""
ngrams: dict[str, list[str]] = defaultdict(list)
for term in terms:
if len(term) >= 3:
for i in range(len(term) - 2):
ngrams[term[i:i + 3]].append(term)
return {ngram: sorted(set(terms)) for ngram, terms in ngrams.items()}

@classmethod
def _ngrams_trie(cls, ngrams: dict[str, list[str]]) -> _TrieNode:
"""
Given a dictionary of terms and a mapping of ngrams to those terms, compress the
mapping into a trie, using an empty-string key to notate the contents (terms
containing the ngram) at each node.
"""
root: IndexBuilder._TrieNode = {}
for ngram, terms in ngrams.items():
location = root
for char in ngram:
location = location.setdefault(char, {}) # type: ignore[assignment]
location[cls._TRIE_CONTENTS] = terms
return root

@classmethod
def _minify_trie(cls, trie: _TrieNode, term_offsets: dict[str, int]) -> _TrieNode:
"""Minify the representation of an ngram-to-terms trie datastructure."""
for key, subnode in trie.copy().items():
if not isinstance(subnode, dict):
continue

# Replace the string values of terms with their numeric termlist offset.
if node_terms := subnode.get(cls._TRIE_CONTENTS, []):
offsets = [term_offsets[term] for term in node_terms]
# Replace single-valued contents with integers instead of list values.
subnode[cls._TRIE_CONTENTS] = offsets[0] if len(offsets) == 1 else offsets # type: ignore[assignment]

# Replace single-leaf content nodes (dictionaries) with their (list) contents.
if len(subnode) == 1 and cls._TRIE_CONTENTS in subnode:
trie[key] = subnode[cls._TRIE_CONTENTS]
continue

# Abbreviate unambiguous paths; (a.empty -> b.empty -> c) becomes (abc).
trie[key] = IndexBuilder._minify_trie(subnode, term_offsets)
if len(trie[key]) == 1 and cls._TRIE_CONTENTS not in trie[key]:
subkey, subnode = trie.pop(key).popitem() # type: ignore[union-attr]
trie[key + subkey] = subnode

return trie

def freeze(self) -> dict[str, Any]:
"""Create a usable data structure for serializing."""
docnames, titles = zip(*sorted(self._titles.items()))
filenames = [self._filenames.get(docname) for docname in docnames]
fn2index = {f: i for (i, f) in enumerate(docnames)}
terms, title_terms = self.get_terms(fn2index)
term_offsets = {term: idx for idx, term in enumerate(sorted(terms))}

terms_ngrams_mapping = self._terms_ngrams(terms)
terms_ngrams_trie = self._ngrams_trie(terms_ngrams_mapping) # compress into a trie
terms_ngrams = self._minify_trie(terms_ngrams_trie, term_offsets) # remove redundancy

objects = self.get_objects(fn2index) # populates _objtypes
objtypes = {v: k[0] + ':' + k[1] for (k, v) in self._objtypes.items()}
Expand All @@ -411,7 +473,8 @@ def freeze(self) -> dict[str, Any]:
return dict(docnames=docnames, filenames=filenames, titles=titles, terms=terms,
objects=objects, objtypes=objtypes, objnames=objnames,
titleterms=title_terms, envversion=self.env.version,
alltitles=alltitles, indexentries=index_entries)
alltitles=alltitles, indexentries=index_entries,
termsngrams=terms_ngrams)

def label(self) -> str:
return f"{self.lang.language_name} (code: {self.lang.lang})"
Expand Down
24 changes: 21 additions & 3 deletions sphinx/themes/basic/static/searchtools.js
Original file line number Diff line number Diff line change
Expand Up @@ -491,6 +491,7 @@ const Search = {
performTermsSearch: (searchTerms, excludedTerms) => {
// prepare search
const terms = Search._index.terms;
const termsNgrams = Search._index.termsngrams;
const titleTerms = Search._index.titleterms;
const filenames = Search._index.filenames;
const docNames = Search._index.docnames;
Expand All @@ -510,9 +511,26 @@ const Search = {
if (word.length > 2) {
const escapedWord = _escapeRegExp(word);
if (!terms.hasOwnProperty(word)) {
Object.keys(terms).forEach((term) => {
if (term.match(escapedWord))
arr.push({ files: terms[term], score: Scorer.partialTerm });
const termOffsets = Object.keys(terms);
// Look up the terms that contain the given ngram by walking the ngram trie.
// Returns an array of term strings; the array is empty when the ngram is
// not present in the trie.
let ngramTerms = function (ngram) {
  let [node, path] = [termsNgrams, ""];
  // Trie keys may span several characters, so accumulate characters into
  // `path` until it names a key of the current node, then descend.
  for (const step of ngram) {
    if ((path += step) in node) [node, path] = [node[path], ""];
  }
  // Unconsumed characters, or stopping on an interior (object) node, mean
  // the ngram is not indexed: report no candidates instead of a bogus one.
  if (path !== "" || (typeof node === "object" && !Array.isArray(node))) return [];
  // A leaf is either a single term offset or an array of offsets.
  if (node.length === undefined) node = [node];
  return node.map(offset => termOffsets[offset]);
};

const candidateTerms = new Set(ngramTerms(word.substring(0, 3)));
for (let start = 1; candidateTerms.size && start + 3 <= word.length; start++) {
const subsequentTerms = new Set(ngramTerms(word.substring(start, start + 3)));
for (const candidateTerm of candidateTerms) {
if (!subsequentTerms.has(candidateTerm)) candidateTerms.delete(candidateTerm);
}
}

candidateTerms.forEach((term) => {
arr.push({ files: terms[term], score: Scorer.partialTerm });
});
}
if (!titleTerms.hasOwnProperty(word)) {
Expand Down
2 changes: 1 addition & 1 deletion tests/js/fixtures/cpp/searchindex.js

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 1 addition & 1 deletion tests/js/fixtures/multiterm/searchindex.js

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 1 addition & 1 deletion tests/js/fixtures/partial/searchindex.js

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

Loading

0 comments on commit 70788b6

Please sign in to comment.