Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

HTML search: Introduce ngram-based partial-match searching #12596

Closed
Show file tree
Hide file tree
Changes from 10 commits
Commits
Show all changes
16 commits
Select commit Hold shift + click to select a range
70788b6
[HTML search] Add preparatory test coverage before anticipated partia…
jayaddison Jul 15, 2024
1a08711
Merge branch 'master' into issue-12045/partial-search-ngrams
jayaddison Jul 17, 2024
8c1df09
Add CHANGES.rst entry.
jayaddison Jul 17, 2024
ffc914c
Nitpick: remove redundant dict-lookup fallback value.
jayaddison Jul 17, 2024
996f805
[HTML search] Tests: add coverage for non-matching query.
jayaddison Jul 18, 2024
83cf50c
[HTML search] Tests: cleanup: remove duplicate test accidentally adde…
jayaddison Jul 18, 2024
3ad0139
[HTML search] Nitpick: declare `ngramTerms` helper function as a `con…
jayaddison Jul 18, 2024
ee05a38
Merge branch 'master' into issue-12045/partial-search-ngrams
jayaddison Jul 18, 2024
115a0ef
Merge branch 'master' into issue-12045/partial-search-ngrams
jayaddison Jul 19, 2024
e096a36
Fixup: remove extraneous newline accidentally added to CHANGES.rst.
jayaddison Jul 19, 2024
a59b630
[HTML search] Tests: add some safety-guard tests around ngram suffix…
jayaddison Jul 19, 2024
5253bd5
[HTML search] Optimization: traverse query terms two characters at a …
jayaddison Jul 19, 2024
94daefd
[HTML search] Fixup: return empty when trie-path lookup is incomplete.
jayaddison Jul 19, 2024
d38500b
[HTML search] Safety guard: restore `escapedWord` conditional check b…
jayaddison Jul 19, 2024
fd13f5d
[HTML search] Refactor / optimization: delay term-offset lookup until…
jayaddison Jul 19, 2024
6d885d6
[HTML search] Refactor / brevity: use a JavaScript `Array` instead of…
jayaddison Jul 19, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 8 additions & 0 deletions CHANGES.rst
Original file line number Diff line number Diff line change
@@ -1,6 +1,14 @@
Release 7.4.7 (in development)
==============================

Features added
--------------

* #12045: Use an index-directed ngram search for partial-string matches.
Results are unaffected compared to the previous brute-force approach, but
performance should be much more consistent (especially for large datasets).
Patch by James Addison.

Bugs fixed
----------

Expand Down
67 changes: 65 additions & 2 deletions sphinx/search/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,9 +7,10 @@
import json
import pickle
import re
from collections import defaultdict
from importlib import import_module
from os import path
from typing import IO, TYPE_CHECKING, Any
from typing import IO, TYPE_CHECKING, Any, Union

from docutils import nodes
from docutils.nodes import Element, Node
Expand All @@ -21,6 +22,8 @@
if TYPE_CHECKING:
from collections.abc import Iterable

from typing_extensions import TypeAlias


class SearchLanguage:
"""
Expand Down Expand Up @@ -250,6 +253,9 @@ class IndexBuilder:
'pickle': pickle
}

_TRIE_CONTENTS = "" # serializable sentinel value
jayaddison marked this conversation as resolved.
Show resolved Hide resolved
_TrieNode: TypeAlias = dict[str, Union["_TrieNode", list]]

def __init__(self, env: BuildEnvironment, lang: str, options: dict[str, str], scoring: str) -> None:
self.env = env
# docname -> title
Expand Down Expand Up @@ -387,12 +393,68 @@ def get_terms(self, fn2index: dict[str, int]) -> tuple[dict[str, list[int] | int
rv[k] = sorted(fn2index[fn] for fn in v if fn in fn2index)
return rvs

@staticmethod
def _terms_ngrams(terms: dict[str, Any]) -> dict[str, list[str]]:
"""Extract unique ngrams (currently, trigrams) from the input search terms."""
ngrams: dict[str, list[str]] = defaultdict(list)
for term in terms:
if len(term) >= 3:
for i in range(len(term) - 2):
ngrams[term[i:i + 3]].append(term)
return {ngram: sorted(set(terms)) for ngram, terms in ngrams.items()}

@classmethod
def _ngrams_trie(cls, ngrams: dict[str, list[str]]) -> _TrieNode:
    """Build a trie from an ngram-to-terms mapping.

    Each character of an ngram becomes one level of nesting in the trie;
    the list of terms containing the ngram is stored at the final node
    under the empty-string sentinel key (``cls._TRIE_CONTENTS``).
    """
    trie: IndexBuilder._TrieNode = {}
    for ngram, matching_terms in ngrams.items():
        node = trie
        # Descend one level per character, creating nodes as required.
        for character in ngram:
            node = node.setdefault(character, {})  # type: ignore[assignment]
        node[cls._TRIE_CONTENTS] = matching_terms
    return trie

@classmethod
def _minify_trie(cls, trie: _TrieNode, term_offsets: dict[str, int]) -> _TrieNode:
    """Minify the representation of an ngram-to-terms trie datastructure.

    Mutates ``trie`` in place (and also returns it) by applying three
    space-saving transformations, recursively:

    * term strings are replaced by their integer offset into the sorted
      term list (``term_offsets``);
    * single-entry content lists are unwrapped to a bare integer;
    * chains of single-child nodes are collapsed into one multi-character
      key (path compression).
    """
    # Iterate over a shallow copy: the loop body inserts and removes keys
    # of ``trie`` (path compression below), which would otherwise raise a
    # RuntimeError for mutation during iteration.
    for key, subnode in trie.copy().items():
        # Leaf values (already-minified lists/ints) need no further work.
        if not isinstance(subnode, dict):
            continue

        # Replace the string values of terms with their numeric termlist offset.
        if node_terms := subnode.get(cls._TRIE_CONTENTS):
            offsets = [term_offsets[term] for term in node_terms]
            # Replace single-valued contents with integers instead of list values.
            subnode[cls._TRIE_CONTENTS] = offsets[0] if len(offsets) == 1 else offsets  # type: ignore[assignment]

        # Replace single-leaf content nodes (dictionaries) with their (list) contents.
        if len(subnode) == 1 and cls._TRIE_CONTENTS in subnode:
            trie[key] = subnode[cls._TRIE_CONTENTS]
            continue

        # Abbreviate unambiguous paths; (a.empty -> b.empty -> c) becomes (abc).
        # First minify the subtree, then — if the minified child has exactly
        # one (non-contents) entry — merge the child's key into this node's
        # key.  NOTE: ``subnode`` is deliberately rebound here.
        trie[key] = IndexBuilder._minify_trie(subnode, term_offsets)
        if len(trie[key]) == 1 and cls._TRIE_CONTENTS not in trie[key]:
            subkey, subnode = trie.pop(key).popitem()  # type: ignore[union-attr]
            trie[key + subkey] = subnode

    return trie

def freeze(self) -> dict[str, Any]:
"""Create a usable data structure for serializing."""
docnames, titles = zip(*sorted(self._titles.items()))
filenames = [self._filenames.get(docname) for docname in docnames]
fn2index = {f: i for (i, f) in enumerate(docnames)}
terms, title_terms = self.get_terms(fn2index)
term_offsets = {term: idx for idx, term in enumerate(sorted(terms))}
jayaddison marked this conversation as resolved.
Show resolved Hide resolved

terms_ngrams_mapping = self._terms_ngrams(terms)
terms_ngrams_trie = self._ngrams_trie(terms_ngrams_mapping) # compress into a trie
terms_ngrams = self._minify_trie(terms_ngrams_trie, term_offsets) # remove redundancy

objects = self.get_objects(fn2index) # populates _objtypes
objtypes = {v: k[0] + ':' + k[1] for (k, v) in self._objtypes.items()}
Expand All @@ -411,7 +473,8 @@ def freeze(self) -> dict[str, Any]:
return dict(docnames=docnames, filenames=filenames, titles=titles, terms=terms,
objects=objects, objtypes=objtypes, objnames=objnames,
titleterms=title_terms, envversion=self.env.version,
alltitles=alltitles, indexentries=index_entries)
alltitles=alltitles, indexentries=index_entries,
termsngrams=terms_ngrams)

def label(self) -> str:
return f"{self.lang.language_name} (code: {self.lang.lang})"
Expand Down
25 changes: 22 additions & 3 deletions sphinx/themes/basic/static/searchtools.js
Original file line number Diff line number Diff line change
Expand Up @@ -491,6 +491,7 @@ const Search = {
performTermsSearch: (searchTerms, excludedTerms) => {
// prepare search
const terms = Search._index.terms;
const termsNgrams = Search._index.termsngrams;
const titleTerms = Search._index.titleterms;
const filenames = Search._index.filenames;
const docNames = Search._index.docnames;
Expand All @@ -510,9 +511,27 @@ const Search = {
if (word.length > 2) {
const escapedWord = _escapeRegExp(word);
if (!terms.hasOwnProperty(word)) {
Object.keys(terms).forEach((term) => {
if (term.match(escapedWord))
arr.push({ files: terms[term], score: Scorer.partialTerm });
const termOffsets = Object.keys(terms);
const ngramTerms = function (ngram) {
let [node, path] = [termsNgrams, ""];
for (const step of ngram) {
if ((path += step) in node) [node, path] = [node[path], ""];
}
if (!node) return [];
if (node.length === undefined) node = [node];
return node.map(offset => termOffsets[offset]);
jayaddison marked this conversation as resolved.
Show resolved Hide resolved
};

const candidateTerms = new Set(ngramTerms(word.substring(0, 3)));
for (let start = 1; candidateTerms.size && start + 3 <= word.length; start++) {
jayaddison marked this conversation as resolved.
Show resolved Hide resolved
const subsequentTerms = new Set(ngramTerms(word.substring(start, start + 3)));
for (const candidateTerm of candidateTerms) {
if (!subsequentTerms.has(candidateTerm)) candidateTerms.delete(candidateTerm);
}
jayaddison marked this conversation as resolved.
Show resolved Hide resolved
}
jayaddison marked this conversation as resolved.
Show resolved Hide resolved

candidateTerms.forEach((term) => {
arr.push({ files: terms[term], score: Scorer.partialTerm });
});
}
if (!titleTerms.hasOwnProperty(word)) {
Expand Down
2 changes: 1 addition & 1 deletion tests/js/fixtures/cpp/searchindex.js

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 1 addition & 1 deletion tests/js/fixtures/multiterm/searchindex.js

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 1 addition & 1 deletion tests/js/fixtures/partial/searchindex.js

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

Loading