feat(search): improve search index by chunking content at h2 and h3 headings (#290)

This commit is contained in:
Dillon 2020-04-30 03:22:52 +08:00 committed by GitHub
parent 108679e137
commit 3096ff6235
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
8 changed files with 32 additions and 25 deletions

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

View file

@ -116,7 +116,7 @@ enableEmoji = true
# type of search engine ("lunr", "algolia") # type of search engine ("lunr", "algolia")
type = "algolia" type = "algolia"
# max index length of the chunked content # max index length of the chunked content
contentLength = 5000 contentLength = 4000
# placeholder of the search bar # placeholder of the search bar
placeholder = "" placeholder = ""
# max number of results length # max number of results length
@ -310,7 +310,7 @@ enableEmoji = true
# 搜索引擎的类型 ("lunr", "algolia") # 搜索引擎的类型 ("lunr", "algolia")
type = "algolia" type = "algolia"
# 文章内容最长索引长度 # 文章内容最长索引长度
contentLength = 5000 contentLength = 4000
# 搜索框的占位提示语 # 搜索框的占位提示语
placeholder = "" placeholder = ""
# 最大结果数目 # 最大结果数目
@ -500,7 +500,7 @@ enableEmoji = true
# type of search engine ("lunr", "algolia") # type of search engine ("lunr", "algolia")
type = "algolia" type = "algolia"
# max index length of the chunked content # max index length of the chunked content
contentLength = 5000 contentLength = 4000
# placeholder of the search bar # placeholder of the search bar
placeholder = "" placeholder = ""
# max number of results length # max number of results length

View file

@ -222,7 +222,7 @@ Please open the code block below to view the complete sample configuration :(far
# type of search engine ("lunr", "algolia") # type of search engine ("lunr", "algolia")
type = "lunr" type = "lunr"
# max index length of the chunked content # max index length of the chunked content
contentLength = 5000 contentLength = 4000
# placeholder of the search bar # placeholder of the search bar
placeholder = "" placeholder = ""
# {{< version 0.2.1 >}} max number of results length # {{< version 0.2.1 >}} max number of results length
@ -972,7 +972,7 @@ Here is the search configuration in your [site configuration](#site-configuratio
# type of search engine ("lunr", "algolia") # type of search engine ("lunr", "algolia")
type = "lunr" type = "lunr"
# max index length of the chunked content # max index length of the chunked content
contentLength = 5000 contentLength = 4000
# placeholder of the search bar # placeholder of the search bar
placeholder = "" placeholder = ""
# {{< version 0.2.1 >}} max number of results length # {{< version 0.2.1 >}} max number of results length
@ -992,8 +992,8 @@ The following is a comparison of two search engines:
but high bandwidth and low performance (Especially for Chinese which needs a large segmentit library) but high bandwidth and low performance (Especially for Chinese which needs a large segmentit library)
* `algolia`: high performance and low bandwidth, but need to synchronize `index.json` and limit for `contentLength` * `algolia`: high performance and low bandwidth, but need to synchronize `index.json` and limit for `contentLength`
{{< version 0.2.1 >}} The content of the post is separated by `h2` HTML tag to improve query performance and basically implement full-text search. {{< version 0.2.3 >}} The content of the post is separated by `h2` and `h3` HTML tags to improve query performance and basically implement full-text search.
`contentLength` is used to limit the max index length of the part starting with `h2` HTML tag. `contentLength` is used to limit the max index length of the part starting with an `h2` or `h3` HTML tag.
{{< /admonition >}} {{< /admonition >}}
{{< admonition tip "Tips about algolia" >}} {{< admonition tip "Tips about algolia" >}}

View file

@ -227,7 +227,7 @@ Please open the code block below to view the complete sample configuration :(far
# type of search engine ("lunr", "algolia") # type of search engine ("lunr", "algolia")
type = "lunr" type = "lunr"
# max index length of the chunked content # max index length of the chunked content
contentLength = 5000 contentLength = 4000
# placeholder of the search bar # placeholder of the search bar
placeholder = "" placeholder = ""
# {{< version 0.2.1 >}} max number of results length # {{< version 0.2.1 >}} max number of results length
@ -977,7 +977,7 @@ Here is the search configuration in your [site configuration](#site-configuratio
# type of search engine ("lunr", "algolia") # type of search engine ("lunr", "algolia")
type = "lunr" type = "lunr"
# max index length of the chunked content # max index length of the chunked content
contentLength = 5000 contentLength = 4000
# placeholder of the search bar # placeholder of the search bar
placeholder = "" placeholder = ""
# {{< version 0.2.1 >}} max number of results length # {{< version 0.2.1 >}} max number of results length
@ -997,8 +997,8 @@ The following is a comparison of two search engines:
but high bandwidth and low performance (Especially for Chinese which needs a large segmentit library) but high bandwidth and low performance (Especially for Chinese which needs a large segmentit library)
* `algolia`: high performance and low bandwidth, but need to synchronize `index.json` and limit for `contentLength` * `algolia`: high performance and low bandwidth, but need to synchronize `index.json` and limit for `contentLength`
{{< version 0.2.1 >}} The content of the post is separated by `h2` HTML tag to improve query performance and basically implement full-text search. {{< version 0.2.3 >}} The content of the post is separated by `h2` and `h3` HTML tags to improve query performance and basically implement full-text search.
`contentLength` is used to limit the max index length of the part starting with `h2` HTML tag. `contentLength` is used to limit the max index length of the part starting with an `h2` or `h3` HTML tag.
{{< /admonition >}} {{< /admonition >}}
{{< admonition tip "Tips about algolia" >}} {{< admonition tip "Tips about algolia" >}}

View file

@ -225,7 +225,7 @@ hugo
# 搜索引擎的类型 ("lunr", "algolia") # 搜索引擎的类型 ("lunr", "algolia")
type = "lunr" type = "lunr"
# 文章内容最长索引长度 # 文章内容最长索引长度
contentLength = 5000 contentLength = 4000
# 搜索框的占位提示语 # 搜索框的占位提示语
placeholder = "" placeholder = ""
# 最大结果数目 # 最大结果数目
@ -977,7 +977,7 @@ defaultContentLanguage = "zh-cn"
# 搜索引擎的类型 ("lunr", "algolia") # 搜索引擎的类型 ("lunr", "algolia")
type = "lunr" type = "lunr"
# 文章内容最长索引长度 # 文章内容最长索引长度
contentLength = 5000 contentLength = 4000
# 搜索框的占位提示语 # 搜索框的占位提示语
placeholder = "" placeholder = ""
# 最大结果数目 # 最大结果数目
@ -996,8 +996,8 @@ defaultContentLanguage = "zh-cn"
* `lunr`: 简单, 无需同步 `index.json`, 没有 `contentLength` 的限制, 但占用带宽大且性能低 (特别是中文需要一个较大的分词依赖库) * `lunr`: 简单, 无需同步 `index.json`, 没有 `contentLength` 的限制, 但占用带宽大且性能低 (特别是中文需要一个较大的分词依赖库)
* `algolia`: 高性能并且占用带宽低, 但需要同步 `index.json` 且有 `contentLength` 的限制 * `algolia`: 高性能并且占用带宽低, 但需要同步 `index.json` 且有 `contentLength` 的限制
{{< version 0.2.1 >}} 文章内容被 `h2` HTML 标签切分来提供查询效果并且基本实现全文搜索. {{< version 0.2.3 >}} 文章内容被 `h2` 和 `h3` HTML 标签切分来提高查询效果并且基本实现全文搜索.
`contentLength` 用来限制 `h2` HTML 标签开头的内容部分的最大长度. `contentLength` 用来限制 `h2` 或 `h3` HTML 标签开头的内容部分的最大长度.
{{< /admonition >}} {{< /admonition >}}
{{< admonition tip "关于 algolia 的使用技巧" >}} {{< admonition tip "关于 algolia 的使用技巧" >}}

View file

@ -14,17 +14,24 @@
{{- end -}} {{- end -}}
{{- $params := .Params | merge $.Site.Params.page -}} {{- $params := .Params | merge $.Site.Params.page -}}
{{- $content := dict "content" .Content "ruby" $params.ruby "fraction" $params.fraction "fontawesome" $params.fontawesome | partial "function/content.html" -}} {{- $content := dict "content" .Content "ruby" $params.ruby "fraction" $params.fraction "fontawesome" $params.fontawesome | partial "function/content.html" -}}
{{- range $i, $chunked := split $content "<h2 id=" -}} {{- range $i, $contenti := split $content "<h2 id=" -}}
{{- if gt $i 0 -}} {{- if gt $i 0 -}}
{{- $chunked = printf "<h2 id=%s" $chunked -}} {{- $contenti = printf "<h2 id=%s" $contenti -}}
{{- end -}} {{- end -}}
{{- $chunked = $chunked | plainify | htmlUnescape | replace "\n" " " | replace "\t" " " | replaceRE " +" " " -}} {{- range $j, $contentj := split $contenti "<h3 id=" -}}
{{- if gt $j 0 -}}
{{- $contentj = printf "<h3 id=%s" $contentj -}}
{{- end -}}
{{- $contentj = $contentj | plainify | htmlUnescape | replaceRE `[\n\t ]+` " " -}}
{{- if gt $.Site.Params.search.contentLength 0 -}} {{- if gt $.Site.Params.search.contentLength 0 -}}
{{- $chunked = substr $chunked 0 $.Site.Params.search.contentLength -}} {{- $contentj = substr $contentj 0 $.Site.Params.search.contentLength -}}
{{- end -}} {{- end -}}
{{- $one := printf "%s:%d" $uri $i | dict "content" $chunked "objectID" | merge $meta -}} {{- if $contentj | and (ne $contentj " ") -}}
{{- $one := printf "%s:%d:%d" $uri $i $j | dict "content" $contentj "objectID" | merge $meta -}}
{{- $index = $index | append $one -}} {{- $index = $index | append $one -}}
{{- end -}} {{- end -}}
{{- end -}}
{{- end -}}
{{- end -}} {{- end -}}
{{- $index | jsonify | safeJS -}} {{- $index | jsonify | safeJS -}}

View file

@ -237,7 +237,7 @@ class Theme {
this._algoliaIndex this._algoliaIndex
.search(query, { .search(query, {
offset: 0, offset: 0,
length: searchConfig.maxResultLength * 3, length: searchConfig.maxResultLength * 10,
attributesToHighlight: ['title'], attributesToHighlight: ['title'],
attributesToSnippet: ['content:30'], attributesToSnippet: ['content:30'],
highlightPreTag: `<${searchConfig.highlightTag}>`, highlightPreTag: `<${searchConfig.highlightTag}>`,