From 70788b62495ab1659504fc9f24574837add83a45 Mon Sep 17 00:00:00 2001 From: James Addison Date: Mon, 15 Jul 2024 22:56:28 +0100 Subject: [PATCH] [HTML search] Add preparatory test coverage before anticipated partial-search refactoring. --- sphinx/search/__init__.py | 67 +++++++- sphinx/themes/basic/static/searchtools.js | 24 ++- tests/js/fixtures/cpp/searchindex.js | 2 +- tests/js/fixtures/multiterm/searchindex.js | 2 +- tests/js/fixtures/partial/searchindex.js | 2 +- tests/js/fixtures/titles/searchindex.js | 2 +- tests/js/roots/partial/index.rst | 2 +- tests/js/searchtools.js | 18 +++ tests/test_search.py | 176 +++++++++++++++++++++ 9 files changed, 285 insertions(+), 10 deletions(-) diff --git a/sphinx/search/__init__.py b/sphinx/search/__init__.py index 77eadb8321b..122f9fc51c2 100644 --- a/sphinx/search/__init__.py +++ b/sphinx/search/__init__.py @@ -7,9 +7,10 @@ import json import pickle import re +from collections import defaultdict from importlib import import_module from os import path -from typing import IO, TYPE_CHECKING, Any +from typing import IO, TYPE_CHECKING, Any, Union from docutils import nodes from docutils.nodes import Element, Node @@ -21,6 +22,8 @@ if TYPE_CHECKING: from collections.abc import Iterable + from typing_extensions import TypeAlias + class SearchLanguage: """ @@ -250,6 +253,9 @@ class IndexBuilder: 'pickle': pickle } + _TRIE_CONTENTS = "" # serializable sentinel value + _TrieNode: TypeAlias = dict[str, Union["_TrieNode", list]] + def __init__(self, env: BuildEnvironment, lang: str, options: dict[str, str], scoring: str) -> None: self.env = env # docname -> title @@ -387,12 +393,68 @@ def get_terms(self, fn2index: dict[str, int]) -> tuple[dict[str, list[int] | int rv[k] = sorted(fn2index[fn] for fn in v if fn in fn2index) return rvs + @staticmethod + def _terms_ngrams(terms: dict[str, Any]) -> dict[str, list[str]]: + """Extract unique ngrams (currently, trigrams) from the input search terms.""" + ngrams: dict[str, list[str]] = 
defaultdict(list) + for term in terms: + if len(term) >= 3: + for i in range(len(term) - 2): + ngrams[term[i:i + 3]].append(term) + return {ngram: sorted(set(terms)) for ngram, terms in ngrams.items()} + + @classmethod + def _ngrams_trie(cls, ngrams: dict[str, list[str]]) -> _TrieNode: + """ + Given a mapping of ngrams to the terms that contain them, compress the + mapping into a trie, using an empty-string key to denote the contents (terms + containing the ngram) at each node. + """ + root: IndexBuilder._TrieNode = {} + for ngram, terms in ngrams.items(): + location = root + for char in ngram: + location = location.setdefault(char, {}) # type: ignore[assignment] + location[cls._TRIE_CONTENTS] = terms + return root + + @classmethod + def _minify_trie(cls, trie: _TrieNode, term_offsets: dict[str, int]) -> _TrieNode: + """Minify the representation of an ngram-to-terms trie data structure.""" + for key, subnode in trie.copy().items(): + if not isinstance(subnode, dict): + continue + + # Replace the string values of terms with their numeric termlist offset. + if node_terms := subnode.get(cls._TRIE_CONTENTS, []): + offsets = [term_offsets[term] for term in node_terms] + # Replace single-valued contents with integers instead of list values. + subnode[cls._TRIE_CONTENTS] = offsets[0] if len(offsets) == 1 else offsets # type: ignore[assignment] + + # Replace single-leaf content nodes (dictionaries) with their contents. + if len(subnode) == 1 and cls._TRIE_CONTENTS in subnode: + trie[key] = subnode[cls._TRIE_CONTENTS] + continue + + # Abbreviate unambiguous paths; (a.empty -> b.empty -> c) becomes (abc). 
+ trie[key] = IndexBuilder._minify_trie(subnode, term_offsets) + if len(trie[key]) == 1 and cls._TRIE_CONTENTS not in trie[key]: + subkey, subnode = trie.pop(key).popitem() # type: ignore[union-attr] + trie[key + subkey] = subnode + + return trie + def freeze(self) -> dict[str, Any]: """Create a usable data structure for serializing.""" docnames, titles = zip(*sorted(self._titles.items())) filenames = [self._filenames.get(docname) for docname in docnames] fn2index = {f: i for (i, f) in enumerate(docnames)} terms, title_terms = self.get_terms(fn2index) + term_offsets = {term: idx for idx, term in enumerate(sorted(terms))} + + terms_ngrams_mapping = self._terms_ngrams(terms) + terms_ngrams_trie = self._ngrams_trie(terms_ngrams_mapping) # compress into a trie + terms_ngrams = self._minify_trie(terms_ngrams_trie, term_offsets) # remove redundancy objects = self.get_objects(fn2index) # populates _objtypes objtypes = {v: k[0] + ':' + k[1] for (k, v) in self._objtypes.items()} @@ -411,7 +473,8 @@ def freeze(self) -> dict[str, Any]: return dict(docnames=docnames, filenames=filenames, titles=titles, terms=terms, objects=objects, objtypes=objtypes, objnames=objnames, titleterms=title_terms, envversion=self.env.version, - alltitles=alltitles, indexentries=index_entries) + alltitles=alltitles, indexentries=index_entries, + termsngrams=terms_ngrams) def label(self) -> str: return f"{self.lang.language_name} (code: {self.lang.lang})" diff --git a/sphinx/themes/basic/static/searchtools.js b/sphinx/themes/basic/static/searchtools.js index b08d58c9b9b..226d5b5ab13 100644 --- a/sphinx/themes/basic/static/searchtools.js +++ b/sphinx/themes/basic/static/searchtools.js @@ -491,6 +491,7 @@ const Search = { performTermsSearch: (searchTerms, excludedTerms) => { // prepare search const terms = Search._index.terms; + const termsNgrams = Search._index.termsngrams; const titleTerms = Search._index.titleterms; const filenames = Search._index.filenames; const docNames = Search._index.docnames; 
@@ -510,9 +511,26 @@ const Search = { if (word.length > 2) { const escapedWord = _escapeRegExp(word); if (!terms.hasOwnProperty(word)) { - Object.keys(terms).forEach((term) => { - if (term.match(escapedWord)) - arr.push({ files: terms[term], score: Scorer.partialTerm }); + const termOffsets = Object.keys(terms); + let ngramTerms = function (ngram) { + let [node, path] = [termsNgrams, ""]; + for (const step of ngram) { + if ((path += step) in node) [node, path] = [node[path], ""]; + } + if (node.length === undefined) node = [node]; + return node.map(offset => termOffsets[offset]); + }; + + const candidateTerms = new Set(ngramTerms(word.substring(0, 3))); + for (let start = 1; candidateTerms.size && start + 3 <= word.length; start++) { + const subsequentTerms = new Set(ngramTerms(word.substring(start, start + 3))); + for (const candidateTerm of candidateTerms) { + if (!subsequentTerms.has(candidateTerm)) candidateTerms.delete(candidateTerm); + } + } + + candidateTerms.forEach((term) => { + arr.push({ files: terms[term], score: Scorer.partialTerm }); }); } if (!titleTerms.hasOwnProperty(word)) { diff --git a/tests/js/fixtures/cpp/searchindex.js b/tests/js/fixtures/cpp/searchindex.js index 46f48244741..eeb21e2046f 100644 --- a/tests/js/fixtures/cpp/searchindex.js +++ b/tests/js/fixtures/cpp/searchindex.js @@ -1 +1 @@ -Search.setIndex({"alltitles": {}, "docnames": ["index"], "envversion": {"sphinx": 62, "sphinx.domains.c": 3, "sphinx.domains.changeset": 1, "sphinx.domains.citation": 1, "sphinx.domains.cpp": 9, "sphinx.domains.index": 1, "sphinx.domains.javascript": 3, "sphinx.domains.math": 2, "sphinx.domains.python": 4, "sphinx.domains.rst": 2, "sphinx.domains.std": 2}, "filenames": ["index.rst"], "indexentries": {"sphinx (c++ class)": [[0, "_CPPv46Sphinx", false]]}, "objects": {"": [[0, 0, 1, "_CPPv46Sphinx", "Sphinx"]]}, "objnames": {"0": ["cpp", "class", "C++ class"]}, "objtypes": {"0": "cpp:class"}, "terms": {"The": 0, "becaus": 0, "c": 0, "can": 0, "cardin": 0, 
"challeng": 0, "charact": 0, "class": 0, "descript": 0, "drop": 0, "engin": 0, "fixtur": 0, "frequent": 0, "gener": 0, "i": 0, "index": 0, "inflat": 0, "mathemat": 0, "occur": 0, "often": 0, "project": 0, "punctuat": 0, "queri": 0, "relat": 0, "sampl": 0, "search": 0, "size": 0, "sphinx": 0, "term": 0, "thei": 0, "thi": 0, "token": 0, "us": 0, "web": 0, "would": 0}, "titles": ["<no title>"], "titleterms": {}}) \ No newline at end of file +Search.setIndex({"alltitles": {}, "docnames": ["index"], "envversion": {"sphinx": 62, "sphinx.domains.c": 3, "sphinx.domains.changeset": 1, "sphinx.domains.citation": 1, "sphinx.domains.cpp": 9, "sphinx.domains.index": 1, "sphinx.domains.javascript": 3, "sphinx.domains.math": 2, "sphinx.domains.python": 4, "sphinx.domains.rst": 2, "sphinx.domains.std": 2}, "filenames": ["index.rst"], "indexentries": {"sphinx (c++ class)": [[0, "_CPPv46Sphinx", false]]}, "objects": {"": [[0, 0, 1, "_CPPv46Sphinx", "Sphinx"]]}, "objnames": {"0": ["cpp", "class", "C++ class"]}, "objtypes": {"0": "cpp:class"}, "terms": {"The": 0, "becaus": 0, "c": 0, "can": 0, "cardin": 0, "challeng": 0, "charact": 0, "class": 0, "descript": 0, "drop": 0, "engin": 0, "fixtur": 0, "frequent": 0, "gener": 0, "i": 0, "index": 0, "inflat": 0, "mathemat": 0, "occur": 0, "often": 0, "project": 0, "punctuat": 0, "queri": 0, "relat": 0, "sampl": 0, "search": 0, "size": 0, "sphinx": 0, "term": 0, "thei": 0, "thi": 0, "token": 0, "us": 0, "web": 0, "would": 0}, "termsngrams": {"The": 0, "a": {"ct": 6, "ll": 5, "mp": 24, "r": {"a": 6, "c": 25, "d": 4}, "ss": 7, "th": 17, "us": 1}, "bec": 1, "c": {"a": {"n": 3, "r": 4, "u": 1}, "cu": 18, "ha": [5, 6], "la": 7, "ri": 8, "tu": 21, "ur": 18}, "d": {"e": {"s": 8, "x": 15}, "in": 4, "ro": 9}, "e": {"ar": 25, "c": {"a": 1, "t": 20}, "la": 23, "ma": 17, "n": {"e": 13, "g": [5, 10], "t": 12}, "qu": 12, "r": {"i": 22, "m": 28}, "sc": 8}, "f": {"ix": 11, "la": 16, "re": 12, "te": 19}, "g": {"en": 13, "in": 10}, "h": {"a": {"l": 5, "r": 6}, 
"e": {"i": 29, "m": 17}, "in": 27}, "i": {"n": {"d": 15, "f": 16, "x": 27}, "pt": 8, "xt": 11, "ze": 26}, "jec": 20, "ken": 31, "l": {"a": {"s": 7, "t": [16, 23]}, "en": 5, "le": 5}, "m": {"at": 17, "pl": 24}, "n": {"ct": 21, "de": 15, "er": 13, "fl": 16, "gi": 10}, "o": {"cc": 18, "ft": 19, "je": 20, "ke": 31, "ul": 34}, "p": {"hi": 27, "ro": 20, "un": 21}, "que": [12, 22], "r": {"ac": 6, "ch": 25, "di": 4, "e": {"l": 23, "q": 12}, "ip": 8, "o": {"j": 20, "p": 9}}, "s": {"am": 24, "cr": 8, "ea": 25, "iz": 26, "ph": 27}, "t": {"e": {"n": 19, "r": 28}, "h": {"e": [17, 29], "i": 30}, "ok": 31, "u": {"a": 21, "r": 11}}, "u": {"at": 21, "e": {"n": 12, "r": 22}, "ld": 34, "nc": 21}, "w": {"eb": 33, "ou": 34}, "xtu": 11}, "titles": ["<no title>"], "titleterms": {}}) \ No newline at end of file diff --git a/tests/js/fixtures/multiterm/searchindex.js b/tests/js/fixtures/multiterm/searchindex.js index a868eb6bdcb..995061752ab 100644 --- a/tests/js/fixtures/multiterm/searchindex.js +++ b/tests/js/fixtures/multiterm/searchindex.js @@ -1 +1 @@ -Search.setIndex({"alltitles": {"Main Page": [[0, null]]}, "docnames": ["index"], "envversion": {"sphinx": 62, "sphinx.domains.c": 3, "sphinx.domains.changeset": 1, "sphinx.domains.citation": 1, "sphinx.domains.cpp": 9, "sphinx.domains.index": 1, "sphinx.domains.javascript": 3, "sphinx.domains.math": 2, "sphinx.domains.python": 4, "sphinx.domains.rst": 2, "sphinx.domains.std": 2}, "filenames": ["index.rst"], "indexentries": {}, "objects": {}, "objnames": {}, "objtypes": {}, "terms": {"At": 0, "adjac": 0, "all": 0, "an": 0, "appear": 0, "applic": 0, "ar": 0, "built": 0, "can": 0, "check": 0, "contain": 0, "do": 0, "document": 0, "doesn": 0, "each": 0, "fixtur": 0, "format": 0, "function": 0, "futur": 0, "html": 0, "i": 0, "includ": 0, "match": 0, "messag": 0, "multipl": 0, "multiterm": 0, "order": 0, "other": 0, "output": 0, "perform": 0, "perhap": 0, "phrase": 0, "project": 0, "queri": 0, "requir": 0, "same": 0, "search": 0, 
"successfulli": 0, "support": 0, "t": 0, "term": 0, "test": 0, "thi": 0, "time": 0, "us": 0, "when": 0, "write": 0}, "titles": ["Main Page"], "titleterms": {"main": 0, "page": 0}}) \ No newline at end of file +Search.setIndex({"alltitles": {"Main Page": [[0, null]]}, "docnames": ["index"], "envversion": {"sphinx": 62, "sphinx.domains.c": 3, "sphinx.domains.changeset": 1, "sphinx.domains.citation": 1, "sphinx.domains.cpp": 9, "sphinx.domains.index": 1, "sphinx.domains.javascript": 3, "sphinx.domains.math": 2, "sphinx.domains.python": 4, "sphinx.domains.rst": 2, "sphinx.domains.std": 2}, "filenames": ["index.rst"], "indexentries": {}, "objects": {}, "objnames": {}, "objtypes": {}, "terms": {"At": 0, "adjac": 0, "all": 0, "an": 0, "appear": 0, "applic": 0, "ar": 0, "built": 0, "can": 0, "check": 0, "contain": 0, "do": 0, "document": 0, "doesn": 0, "each": 0, "fixtur": 0, "format": 0, "function": 0, "futur": 0, "html": 0, "i": 0, "includ": 0, "match": 0, "messag": 0, "multipl": 0, "multiterm": 0, "order": 0, "other": 0, "output": 0, "perform": 0, "perhap": 0, "phrase": 0, "project": 0, "queri": 0, "requir": 0, "same": 0, "search": 0, "successfulli": 0, "support": 0, "t": 0, "term": 0, "test": 0, "thi": 0, "time": 0, "us": 0, "when": 0, "write": 0}, "termsngrams": {"a": {"ch": 14, "dj": 1, "in": 10, "ll": 2, "me": 35, "pp": [4, 5], "rc": 36, "se": 31, "tc": 22}, "bui": 7, "c": {"an": 8, "ce": 37, "es": 37, "he": 9, "lu": 21, "on": 10, "ti": 17, "um": 12}, "d": {"er": 26, "ja": 1, "o": {"c": 12, "e": 13}}, "e": {"a": {"c": 14, "r": [4, 36]}, "c": {"k": 9, "t": 32}, "nt": 12, "qu": 34, "r": {"f": 29, "h": 30, "i": 33, "m": [25, 40]}, "s": {"n": 13, "s": [23, 37], "t": 41}}, "f": {"ix": 15, "or": [16, 29], "u": {"l": 37, "n": 17, "t": 18}}, "h": {"ap": 30, "e": {"c": 9, "n": 45, "r": 27}, "ra": 31, "tm": 19}, "i": {"lt": 7, "me": 43, "nc": 21, "on": 17, "pl": 24, "te": [25, 46], "xt": 15}, "j": {"ac": 1, "ec": 32}, "l": {"ic": 5, "li": 37, "ti": [24, 25], "ud": 21}, "m": 
{"at": [16, 22], "e": {"n": 12, "s": 23}, "ul": [24, 25]}, "n": {"c": {"l": 21, "t": 17}, "ta": 10}, "o": {"cu": 12, "es": 13, "je": 32, "nt": 10, "r": {"d": 26, "m": [16, 29], "t": 38}, "th": 27, "ut": 28}, "p": {"e": {"a": 4, "r": [29, 30]}, "hr": 31, "li": 5, "or": 38, "p": {"e": 4, "l": 5, "o": 38}, "ro": 32, "ut": 28}, "qu": {"e": 33, "i": 34}, "r": {"as": 31, "ch": 36, "de": 26, "eq": 34, "fo": 29, "ha": 30, "it": 46, "ma": 16, "oj": 32}, "s": {"a": {"g": 23, "m": 35}, "ea": 36, "fu": 37, "s": {"a": 23, "f": 37}, "u": {"c": 37, "p": 38}}, "t": {"ai": 10, "ch": 22, "e": {"r": [25, 40], "s": 41}, "h": {"e": 27, "i": 42}, "i": {"m": 43, "o": 17, "p": 24, "t": 25}, "ml": 19, "pu": 28, "ur": [15, 18]}, "u": {"cc": 37, "er": 33, "i": {"l": 7, "r": 34}, "l": {"l": 37, "t": [24, 25]}, "me": 12, "nc": 17, "pp": 38, "t": {"p": 28, "u": 18}}, "w": {"he": 45, "ri": 46}, "xtu": 15}, "titles": ["Main Page"], "titleterms": {"main": 0, "page": 0}}) \ No newline at end of file diff --git a/tests/js/fixtures/partial/searchindex.js b/tests/js/fixtures/partial/searchindex.js index 356386af8dd..45327fa43cc 100644 --- a/tests/js/fixtures/partial/searchindex.js +++ b/tests/js/fixtures/partial/searchindex.js @@ -1 +1 @@ -Search.setIndex({"alltitles": {"sphinx_utils module": [[0, null]]}, "docnames": ["index"], "envversion": {"sphinx": 62, "sphinx.domains.c": 3, "sphinx.domains.changeset": 1, "sphinx.domains.citation": 1, "sphinx.domains.cpp": 9, "sphinx.domains.index": 1, "sphinx.domains.javascript": 3, "sphinx.domains.math": 2, "sphinx.domains.python": 4, "sphinx.domains.rst": 2, "sphinx.domains.std": 2}, "filenames": ["index.rst"], "indexentries": {}, "objects": {}, "objnames": {}, "objtypes": {}, "terms": {"also": 0, "ar": 0, "built": 0, "confirm": 0, "document": 0, "function": 0, "html": 0, "i": 0, "includ": 0, "input": 0, "javascript": 0, "known": 0, "match": 0, "partial": 0, "possibl": 0, "prefix": 0, "project": 0, "provid": 0, "restructuredtext": 0, "sampl": 0, "search": 0, 
"should": 0, "thi": 0, "titl": 0, "us": 0, "when": 0}, "titles": ["sphinx_utils module"], "titleterms": {"modul": 0, "sphinx_util": 0}}) \ No newline at end of file +Search.setIndex({"alltitles": {"sphinx_utils module": [[0, null]]}, "docnames": ["index"], "envversion": {"sphinx": 62, "sphinx.domains.c": 3, "sphinx.domains.changeset": 1, "sphinx.domains.citation": 1, "sphinx.domains.cpp": 9, "sphinx.domains.index": 1, "sphinx.domains.javascript": 3, "sphinx.domains.math": 2, "sphinx.domains.python": 4, "sphinx.domains.rst": 2, "sphinx.domains.std": 2}, "filenames": ["index.rst"], "indexentries": {}, "objects": {}, "objnames": {}, "objtypes": {}, "terms": {"ar": 0, "both": 0, "built": 0, "confirm": 0, "document": 0, "function": 0, "html": 0, "i": 0, "includ": 0, "input": 0, "javascript": 0, "match": 0, "partial": 0, "possibl": 0, "project": 0, "provid": 0, "restructuredtext": 0, "sampl": 0, "search": 0, "should": 0, "term": 0, "thi": 0, "titl": 0, "us": 0, "when": 0}, "termsngrams": {"a": {"mp": 17, "r": {"c": 18, "t": 12}, "sc": 10, "tc": 11, "va": 10}, "b": {"ot": 1, "ui": 2}, "c": {"lu": 8, "on": 3, "ri": 10, "t": {"i": 5, "u": 16}, "um": 4}, "d": {"oc": 4, "te": 16}, "e": {"ar": 18, "ct": 14, "dt": 16, "nt": 4, "rm": 20, "st": 16, "xt": 16}, "f": {"ir": 3, "un": 5}, "h": {"en": 24, "ou": 19, "tm": 6}, "i": {"al": 12, "bl": 13, "lt": 2, "n": {"c": 8, "p": 9}, "on": 5, "pt": 10, "rm": 3, "tl": 22}, "j": {"av": 10, "ec": 14}, "lud": 8, "m": {"at": 11, "en": 4, "pl": 17}, "n": {"c": {"l": 8, "t": 5}, "fi": 3, "pu": 9}, "o": {"cu": 4, "je": 14, "nf": 3, "ss": 13, "th": 1, "ul": 19, "vi": 15}, "p": {"ar": 12, "os": 13, "ro": [14, 15], "ut": 9}, "r": {"ch": 18, "e": {"d": 16, "s": 16}, "ip": 10, "o": {"j": 14, "v": 15}, "ti": 12, "uc": 16}, "s": {"am": 17, "cr": 10, "ea": 18, "ho": 19, "ib": 13, "si": 13, "tr": 16}, "t": {"ch": 11, "e": {"r": 20, "x": 16}, "hi": 21, "i": {"a": 12, "o": 5, "t": 22}, "ml": 6, "ru": 16, "ur": 16}, "u": {"ct": 16, "il": 2, "ld": 19, "me": 
4, "nc": 5, "re": 16}, "v": {"as": 10, "id": 15}, "whe": 24}, "titles": ["sphinx_utils module"], "titleterms": {"modul": 0, "sphinx_util": 0}}) \ No newline at end of file diff --git a/tests/js/fixtures/titles/searchindex.js b/tests/js/fixtures/titles/searchindex.js index 9a229d060bf..3422c262b94 100644 --- a/tests/js/fixtures/titles/searchindex.js +++ b/tests/js/fixtures/titles/searchindex.js @@ -1 +1 @@ -Search.setIndex({"alltitles": {"Main Page": [[0, null]], "Relevance": [[0, "relevance"], [1, null]]}, "docnames": ["index", "relevance"], "envversion": {"sphinx": 62, "sphinx.domains.c": 3, "sphinx.domains.changeset": 1, "sphinx.domains.citation": 1, "sphinx.domains.cpp": 9, "sphinx.domains.index": 1, "sphinx.domains.javascript": 3, "sphinx.domains.math": 2, "sphinx.domains.python": 4, "sphinx.domains.rst": 2, "sphinx.domains.std": 2}, "filenames": ["index.rst", "relevance.rst"], "indexentries": {"example (class in relevance)": [[0, "relevance.Example", false]], "module": [[0, "module-relevance", false]], "relevance": [[0, "module-relevance", false]], "relevance (relevance.example attribute)": [[0, "relevance.Example.relevance", false]]}, "objects": {"": [[0, 0, 0, "-", "relevance"]], "relevance": [[0, 1, 1, "", "Example"]], "relevance.Example": [[0, 2, 1, "", "relevance"]]}, "objnames": {"0": ["py", "module", "Python module"], "1": ["py", "class", "Python class"], "2": ["py", "attribute", "Python attribute"]}, "objtypes": {"0": "py:module", "1": "py:class", "2": "py:attribute"}, "terms": {"": [0, 1], "A": 1, "For": 1, "In": [0, 1], "against": 0, "also": 1, "an": 0, "answer": 0, "appear": 1, "ar": 1, "area": 0, "ask": 0, "attribut": 0, "built": 1, "can": [0, 1], "class": 0, "code": [0, 1], "consid": 1, "contain": 0, "context": 0, "corpu": 1, "could": 1, "demonstr": 0, "describ": 1, "detail": 1, "determin": 1, "docstr": 0, "document": [0, 1], "domain": 1, "engin": 0, "exampl": [0, 1], "extract": 0, "find": 0, "found": 0, "from": 0, "function": 1, "ha": 1, "handl": 
0, "happen": 1, "head": 0, "help": 0, "highli": 1, "how": 0, "i": [0, 1], "improv": 0, "inform": 0, "intend": 0, "issu": 1, "itself": 1, "knowledg": 0, "languag": 1, "less": 1, "like": [0, 1], "match": 0, "mention": 1, "name": [0, 1], "object": 0, "one": 1, "onli": 1, "other": 0, "page": 1, "part": 1, "particular": 0, "printf": 1, "program": 1, "project": 0, "queri": [0, 1], "question": 0, "re": 0, "rel": 0, "research": 0, "result": 1, "sai": 0, "same": 1, "score": 0, "search": [0, 1], "seem": 0, "softwar": 1, "some": 1, "sphinx": 0, "straightforward": 1, "subject": 0, "subsect": 0, "term": [0, 1], "test": 0, "text": 0, "than": 1, "thei": 0, "them": 0, "thi": 0, "titl": 0, "user": [0, 1], "we": [0, 1], "when": 0, "whether": 1, "within": 0, "would": 1}, "titles": ["Main Page", "Relevance"], "titleterms": {"main": 0, "page": 0, "relev": [0, 1]}}) \ No newline at end of file +Search.setIndex({"alltitles": {"Main Page": [[0, null]], "Relevance": [[0, "relevance"], [1, null]]}, "docnames": ["index", "relevance"], "envversion": {"sphinx": 62, "sphinx.domains.c": 3, "sphinx.domains.changeset": 1, "sphinx.domains.citation": 1, "sphinx.domains.cpp": 9, "sphinx.domains.index": 1, "sphinx.domains.javascript": 3, "sphinx.domains.math": 2, "sphinx.domains.python": 4, "sphinx.domains.rst": 2, "sphinx.domains.std": 2}, "filenames": ["index.rst", "relevance.rst"], "indexentries": {"example (class in relevance)": [[0, "relevance.Example", false]], "module": [[0, "module-relevance", false]], "relevance": [[0, "module-relevance", false]], "relevance (relevance.example attribute)": [[0, "relevance.Example.relevance", false]]}, "objects": {"": [[0, 0, 0, "-", "relevance"]], "relevance": [[0, 1, 1, "", "Example"]], "relevance.Example": [[0, 2, 1, "", "relevance"]]}, "objnames": {"0": ["py", "module", "Python module"], "1": ["py", "class", "Python class"], "2": ["py", "attribute", "Python attribute"]}, "objtypes": {"0": "py:module", "1": "py:class", "2": "py:attribute"}, "terms": {"": 
[0, 1], "A": 1, "For": 1, "In": [0, 1], "against": 0, "also": 1, "an": 0, "answer": 0, "appear": 1, "ar": 1, "area": 0, "ask": 0, "attribut": 0, "built": 1, "can": [0, 1], "class": 0, "code": [0, 1], "consid": 1, "contain": 0, "context": 0, "corpu": 1, "could": 1, "demonstr": 0, "describ": 1, "detail": 1, "determin": 1, "docstr": 0, "document": [0, 1], "domain": 1, "engin": 0, "exampl": [0, 1], "extract": 0, "find": 0, "found": 0, "from": 0, "function": 1, "ha": 1, "handl": 0, "happen": 1, "head": 0, "help": 0, "highli": 1, "how": 0, "i": [0, 1], "improv": 0, "inform": 0, "intend": 0, "issu": 1, "itself": 1, "knowledg": 0, "languag": 1, "less": 1, "like": [0, 1], "match": 0, "mention": 1, "name": [0, 1], "object": 0, "one": 1, "onli": 1, "other": 0, "page": 1, "part": 1, "particular": 0, "printf": 1, "program": 1, "project": 0, "queri": [0, 1], "question": 0, "re": 0, "rel": 0, "research": 0, "result": 1, "sai": 0, "same": 1, "score": 0, "search": [0, 1], "seem": 0, "softwar": 1, "some": 1, "sphinx": 0, "straightforward": 1, "subject": 0, "subsect": 0, "term": [0, 1], "test": 0, "text": 0, "than": 1, "thei": 0, "them": 0, "thi": 0, "titl": 0, "user": [0, 1], "we": [0, 1], "when": 0, "whether": 1, "within": 0, "would": 1}, "termsngrams": {"For": 2, "a": {"ct": 31, "g": {"a": 4, "e": 60}, "i": {"g": 80, "l": 24, "n": [4, 18, 28]}, "ls": 5, "m": {"e": [55, 73], "p": 30}, "n": {"d": 37, "g": 50, "s": 7}, "pp": [8, 38], "r": {"c": [70, 75], "d": 80, "e": 10, "t": [61, 62]}, "s": {"k": 11, "s": 15}, "t": {"c": 53, "t": 12}}, "b": {"je": [56, 81], "se": 82, "u": {"i": 13, "t": 12}}, "c": {"an": 14, "la": 15, "o": {"d": 16, "n": [17, 18, 19], "r": [20, 74], "u": 21}, "ri": 23, "st": 26, "ti": 35, "u": {"l": 62, "m": 27}}, "d": {"e": {"m": 22, "s": 23, "t": [24, 25]}, "o": {"c": [26, 27], "m": 28}}, "e": {"a": {"d": 39, "r": [8, 70, 75]}, "ct": [56, 65, 81, 82], "dg": 49, "em": 76, "l": {"f": 48, "p": 40}, "mo": 22, "n": {"d": 46, "g": 29, "t": [27, 54]}, "r": {"i": 66, 
"m": [25, 83]}, "s": {"c": 23, "e": 70, "s": 51, "t": [67, 84], "u": 71}, "t": {"a": 24, "e": 25, "h": 94}, "x": {"a": 30, "t": [19, 31, 85]}}, "f": {"in": 32, "o": {"r": [45, 80], "u": 33}, "ro": 34, "tw": 77, "un": 35}, "g": {"ai": 4, "h": {"l": 41, "t": 80}, "in": 29, "ra": 64, "ua": 50}, "h": {"a": {"n": [37, 86], "p": 38}, "e": {"a": 39, "i": 87, "l": 40, "m": 88, "n": 93, "r": [59, 94], "t": 94}, "i": {"g": 41, "n": [79, 95]}, "li": 41, "ow": 42, "tf": 80}, "i": {"bu": 12, "cu": 62, "gh": [41, 80], "ke": 52, "lt": 13, "mp": 44, "n": {"d": 32, "f": 45, "s": 4, "t": [46, 63], "x": 79}, "on": [35, 54, 67], "ss": 47, "t": {"h": 95, "l": 90, "s": 48}}, "jec": [56, 65, 81], "kno": 49, "l": {"a": {"n": 50, "r": 62, "s": 15}, "e": {"d": 49, "s": 51}, "ik": 52, "so": 5}, "m": {"a": {"i": 28, "t": 53}, "en": [27, 54], "in": 25, "on": 22, "p": {"l": 30, "r": 44}}, "n": {"am": 55, "ct": 35, "dl": 37, "fo": 45, "g": {"i": 29, "u": 50}, "li": 58, "ow": 49, "s": {"i": 17, "t": [4, 22], "w": 7}, "t": {"a": 18, "e": [19, 46], "f": 63, "i": 54}}, "o": {"bj": 56, "c": {"s": 26, "u": 27}, "de": 16, "ft": 77, "gr": 64, "je": 65, "m": {"a": 28, "e": 78}, "n": {"e": 57, "l": 58, "s": [17, 22], "t": [18, 19]}, "r": {"e": 74, "m": 45, "p": 20, "w": 80}, "th": 59, "u": {"l": [21, 96], "n": 33}, "wl": 49}, "p": {"a": {"g": 60, "r": [61, 62]}, "e": {"a": 8, "n": 38}, "hi": 79, "pe": [8, 38], "r": {"i": 63, "o": [44, 64, 65]}}, "que": [66, 67], "r": {"a": {"c": 31, "i": 80, "m": 64}, "ch": [70, 75], "e": {"a": 10, "l": 69, "s": [70, 71]}, "i": {"b": [12, 23], "n": 63}, "mi": 25, "o": {"g": 64, "j": 65, "m": 34, "v": 44}, "pu": 20, "ti": 62, "wa": 80}, "s": {"a": {"i": 72, "m": 73}, "c": {"o": 74, "r": 23}, "e": {"a": [70, 75], "c": 82, "e": 76, "l": 48, "r": 91}, "id": 17, "o": {"f": 77, "m": 78}, "ph": 79, "su": 47, "t": {"i": 67, "r": [22, 26, 80]}, "u": {"b": [81, 82], "l": 71}, "we": 7}, "t": {"ai": [18, 24], "ch": 53, "e": {"n": 46, "r": [25, 83], "s": 84, "x": [19, 85]}, "fo": 80, 
"h": {"a": 86, "e": [59, 87, 88, 94], "i": [89, 95]}, "i": {"c": 62, "o": [35, 54, 67], "t": 90}, "r": {"a": [31, 80], "i": 12}, "se": 48, "tr": 12, "wa": 77}, "u": {"ag": 50, "b": {"j": 81, "s": 82}, "e": {"r": 66, "s": 67}, "il": 13, "l": {"a": 62, "d": [21, 96], "t": 71}, "me": 27, "n": {"c": 35, "d": 33}, "se": 91}, "w": {"ar": [77, 80], "er": 7, "he": [93, 94], "it": 95, "le": 49, "ou": 96}, "x": {"am": 30, "tr": 31}}, "titles": ["Main Page", "Relevance"], "titleterms": {"main": 0, "page": 0, "relev": [0, 1]}}) \ No newline at end of file diff --git a/tests/js/roots/partial/index.rst b/tests/js/roots/partial/index.rst index 6a9561b3994..23fba6dd3eb 100644 --- a/tests/js/roots/partial/index.rst +++ b/tests/js/roots/partial/index.rst @@ -1,7 +1,7 @@ sphinx_utils module =================== -Partial (also known as "prefix") matches on document titles should be possible +Partial matches on document titles and document terms should both be possible using the JavaScript search functionality included when HTML documentation projects are built. 
diff --git a/tests/js/searchtools.js b/tests/js/searchtools.js index a71047dae9f..16ab2c1e358 100644 --- a/tests/js/searchtools.js +++ b/tests/js/searchtools.js @@ -78,6 +78,24 @@ describe('Basic html theme search', function() { expect(Search.performTermsSearch(searchterms, excluded, terms, titleterms)).toEqual(hits); }); + it('should partially-match within "possible" when in term index', function() { + eval(loadFixture("partial/searchindex.js")); + + [_searchQuery, searchterms, excluded, ..._remainingItems] = Search._parseQuery('ossibl'); + terms = Search._index.terms; + titleterms = Search._index.titleterms; + + hits = [[ + "index", + "sphinx_utils module", + "", + null, + 2, + "index.rst" + ]]; + expect(Search.performTermsSearch(searchterms, excluded, terms, titleterms)).toEqual(hits); + }); + }); describe('aggregation of search results', function() { diff --git a/tests/test_search.py b/tests/test_search.py index 3687911e488..e95213d5fa0 100644 --- a/tests/test_search.py +++ b/tests/test_search.py @@ -209,6 +209,32 @@ def test_IndexBuilder(): 'index': [0, 1, 2, 3], 'non': [0, 1, 2, 3], 'test': [0, 1, 2, 3]}, + 'termsngrams': { + 'com': 1, + 'dex': 3, + 'e': { + 'nt': 1, + 'rm': 2, + 'st': 5, + }, + 'fer': 2, + 'i': { + 'nd': 3, + 'on': 2, + }, + 'm': { + 'en': 1, + 'io': 2, + 'me': 1, + }, + 'n': { + 'de': 3, + 'on': 4, + }, + 'omm': 1, + 'rmi': 2, + 'tes': 5, + }, 'titles': ('title1_1', 'title1_2', 'title2_1', 'title2_2'), 'titleterms': { 'another_titl': [0, 1, 2, 3], @@ -279,6 +305,32 @@ def test_IndexBuilder(): 'index': [0, 1], 'non': [0, 1], 'test': [0, 1]}, + 'termsngrams': { + 'com': 1, + 'dex': 3, + 'e': { + 'nt': 1, + 'rm': 2, + 'st': 5, + }, + 'fer': 2, + 'i': { + 'nd': 3, + 'on': 2, + }, + 'm': { + 'en': 1, + 'io': 2, + 'me': 1, + }, + 'n': { + 'de': 3, + 'on': 4, + }, + 'omm': 1, + 'rmi': 2, + 'tes': 5, + }, 'titles': ('title1_2', 'title2_2'), 'titleterms': { 'another_titl': [0, 1], @@ -307,6 +359,130 @@ def test_IndexBuilder_lookup(): assert 
index.lang.lang == 'zh' +def test_IndexBuilder_extract_ngrams(): + input_terms = { + 'Aab': 3, + 'b': 0, + 'aaa': 0, + 'ccdd': [1, 2], + 'aaab': 0, + 'bbbb': [0, 1], + 'abcd': [3, 4, 5], + } + + expected_ngrams = { + 'Aab': ['Aab'], + # 'b': ['b'], + 'aaa': ['aaa', 'aaab'], + 'aab': ['aaab'], + 'abc': ['abcd'], + 'bbb': ['bbbb'], + 'bcd': ['abcd'], + 'ccd': ['ccdd'], + 'cdd': ['ccdd'], + } + + actual_ngrams = IndexBuilder._terms_ngrams(input_terms) + assert expected_ngrams == actual_ngrams + + +def test_IndexBuilder_build_trie(): + input_ngrams = { + 'Aab': ['Aab'], + 'b': ['b'], + 'aaa': ['aaa', 'aaab'], + 'aab': ['aaab'], + 'abc': ['abcd'], + 'bbb': ['bbbb'], + 'bcd': ['abcd'], + 'ccd': ['ccdd'], + 'cdd': ['ccdd'], + } + + CONTAINS = '' # empty-string is used to denote node contents + + expected_trie = { + 'A': {'a': {'b': {CONTAINS: ['Aab']}}}, + 'a': { + 'a': { + 'a': {CONTAINS: ['aaa', 'aaab']}, + 'b': {CONTAINS: ['aaab']}, + }, + 'b': {'c': {CONTAINS: ['abcd']}}, + }, + 'b': { + CONTAINS: ['b'], + 'b': {'b': {CONTAINS: ['bbbb']}}, + 'c': {'d': {CONTAINS: ['abcd']}}, + }, + 'c': { + 'c': {'d': {CONTAINS: ['ccdd']}}, + 'd': {'d': {CONTAINS: ['ccdd']}}, + }, + } + + assert expected_trie == IndexBuilder._ngrams_trie(input_ngrams) + + +def test_IndexBuilder_minify_trie(): + input_terms = { + 'Aab': 1, + 'aaa': 0, + 'aaab': 0, + 'abcd': [3, 4, 5], + 'b': 0, + 'bbbb': [0, 1], + 'ccdd': [1, 2], + } + + CONTAINS = '' # empty-string is used to denote node contents + + input_trie = { + 'A': {'a': {'b': {CONTAINS: ['Aab']}}}, + 'a': { + 'a': { + 'a': {CONTAINS: ['aaa', 'aaab']}, + 'b': {CONTAINS: ['aaab']}, + }, + 'b': { + 'c': {CONTAINS: ['abcd']}, + }, + }, + 'b': { + CONTAINS: ['b'], + 'b': {'b': {CONTAINS: ['bbbb']}}, + 'c': {'d': {CONTAINS: ['abcd']}}, + }, + 'c': { + 'c': {'d': {CONTAINS: ['ccdd']}}, + 'd': {'d': {CONTAINS: ['ccdd']}}, + }, + } + + expected_trie = { + 'Aab': 0, # Aab + 'a': { + 'a': { + 'a': [1, 2], # aaa, aaab + 'b': 2, # aaab + }, + 'bc': 3, # 
abcd + }, + 'b': { + CONTAINS: 4, + 'bb': 5, # bbbb + 'cd': 3, # abcd + }, + 'c': { + 'cd': 6, # ccdd + 'dd': 6, # ccdd + }, + } + + term_offsets = {term: idx for idx, term in enumerate(input_terms)} + assert expected_trie == IndexBuilder._minify_trie(input_trie, term_offsets) + + @pytest.mark.sphinx( testroot='search', confoverrides={'html_search_language': 'zh'},