From cdd2100b4bdccd1f9ce38d15b6f41383dcf1d4ad Mon Sep 17 00:00:00 2001 From: Florian Schade Date: Mon, 23 Oct 2023 13:40:37 +0200 Subject: [PATCH] enhancement: improve content extraction stop word cleaning (#7553) * enhancement: improve content extraction stop word cleaning * fix: cleanup documentation Co-authored-by: Martin * fix: failing tika stop word unit tests --------- Co-authored-by: Martin --- ...ement-search-content-extraction-cleanup.md | 15 ++++++++ services/search/README.md | 3 ++ services/search/pkg/config/content.go | 3 +- .../pkg/config/defaults/defaultconfig.go | 3 +- services/search/pkg/content/content.go | 14 ++++++++ services/search/pkg/content/content_test.go | 36 +++++++++++++++++++ services/search/pkg/content/tika.go | 14 ++++---- services/search/pkg/content/tika_test.go | 21 +++++++++-- 8 files changed, 98 insertions(+), 11 deletions(-) create mode 100644 changelog/unreleased/enhancement-search-content-extraction-cleanup.md create mode 100644 services/search/pkg/content/content_test.go diff --git a/changelog/unreleased/enhancement-search-content-extraction-cleanup.md b/changelog/unreleased/enhancement-search-content-extraction-cleanup.md new file mode 100644 index 0000000000..566b15f779 --- /dev/null +++ b/changelog/unreleased/enhancement-search-content-extraction-cleanup.md @@ -0,0 +1,15 @@ +Enhancement: Tika content extraction cleanup for search + +So far it has not been possible to configure whether +'stop words' are removed from the content extracted for search. +Stop words are filler words like "I, you, have, am" etc. and are +defined by the search engine. + +The behaviour can now be controlled with the newly introduced setting `SEARCH_EXTRACTOR_TIKA_CLEAN_STOP_WORDS`. +Stop word cleaning is enabled by default; set the option to `false` to keep stop words. + +In addition, the stop word cleanup is no longer as aggressive and now leaves numbers, URLs and +basically everything except the defined stop words untouched. 
+ +https://github.com/owncloud/ocis/pull/7553 +https://github.com/owncloud/ocis/issues/6674 diff --git a/services/search/README.md b/services/search/README.md index 82df9ca3dc..739fc3a677 100644 --- a/services/search/README.md +++ b/services/search/README.md @@ -70,6 +70,9 @@ When the search service can reach Tika, it begins to read out the content on dem Content extraction and handling the extracted content can be very resource intensive. Content extraction is therefore limited to files with a certain file size. The default limit is 20MB and can be configured using the `SEARCH_CONTENT_EXTRACTION_SIZE_LIMIT` variable. +When extracting the content you can specify whether filler words are ignored or not. +To keep them, the environment variable `SEARCH_EXTRACTOR_TIKA_CLEAN_STOP_WORDS` must be set to false. + When using the Tika container and docker-compose, consider the following: * See the [ocis_wopi](https://github.com/owncloud/ocis/tree/master/deployments/examples/ocis_wopi) example. diff --git a/services/search/pkg/config/content.go b/services/search/pkg/config/content.go index 1159bf22c3..df753056a6 100644 --- a/services/search/pkg/config/content.go +++ b/services/search/pkg/config/content.go @@ -9,5 +9,6 @@ type Extractor struct { // ExtractorTika configures the Tika extractor type ExtractorTika struct { - TikaURL string `yaml:"tika_url" env:"SEARCH_EXTRACTOR_TIKA_TIKA_URL" desc:"URL of the tika server."` + TikaURL string `yaml:"tika_url" env:"SEARCH_EXTRACTOR_TIKA_TIKA_URL" desc:"URL of the tika server."` + CleanStopWords bool `yaml:"clean_stop_words" env:"SEARCH_EXTRACTOR_TIKA_CLEAN_STOP_WORDS" desc:"Defines if stop words should be cleaned or not. 
See the documentation for more details."` } diff --git a/services/search/pkg/config/defaults/defaultconfig.go b/services/search/pkg/config/defaults/defaultconfig.go index c7575081c7..3f4cce3e0a 100644 --- a/services/search/pkg/config/defaults/defaultconfig.go +++ b/services/search/pkg/config/defaults/defaultconfig.go @@ -43,7 +43,8 @@ func DefaultConfig() *config.Config { Type: "basic", CS3AllowInsecure: false, Tika: config.ExtractorTika{ - TikaURL: "http://127.0.0.1:9998", + TikaURL: "http://127.0.0.1:9998", + CleanStopWords: true, }, }, Events: config.Events{ diff --git a/services/search/pkg/content/content.go b/services/search/pkg/content/content.go index e58f35dc43..ef334e0706 100644 --- a/services/search/pkg/content/content.go +++ b/services/search/pkg/content/content.go @@ -1,5 +1,15 @@ package content +import ( + "strings" + + "github.com/bbalet/stopwords" +) + +func init() { + stopwords.OverwriteWordSegmenter(`[^ ]+`) +} + // Document wraps all resource meta fields, // it is used as a content extraction result. type Document struct { @@ -11,3 +21,7 @@ type Document struct { MimeType string Tags []string } + +func CleanString(content, langCode string) string { + return strings.TrimSpace(stopwords.CleanString(content, langCode, true)) +} diff --git a/services/search/pkg/content/content_test.go b/services/search/pkg/content/content_test.go new file mode 100644 index 0000000000..17631fe4d7 --- /dev/null +++ b/services/search/pkg/content/content_test.go @@ -0,0 +1,36 @@ +package content_test + +import ( + "testing" + + . 
"github.com/stretchr/testify/assert" + + "github.com/owncloud/ocis/v2/services/search/pkg/content" +) + +func TestCleanContent(t *testing.T) { + tests := []struct { + given string + expect string + }{ + { + given: "find can keeper should keeper will", + expect: "keeper keeper", + }, + { + given: "user1 shares the file to Marie", + expect: "user1 shares file marie", + }, + { + given: "content contains https://localhost/remote.php/dav/files/admin/Photos/San%20Francisco.jpg and stop word", + expect: "content contains https://localhost/remote.php/dav/files/admin/photos/san%20francisco.jpg stop word", + }, + } + + for _, tc := range tests { + tc := tc + t.Run(tc.given, func(t *testing.T) { + Equal(t, tc.expect, content.CleanString(tc.given, "en")) + }) + } +} diff --git a/services/search/pkg/content/tika.go b/services/search/pkg/content/tika.go index 1979cdbcdb..b2953170b4 100644 --- a/services/search/pkg/content/tika.go +++ b/services/search/pkg/content/tika.go @@ -5,11 +5,11 @@ import ( "fmt" "strings" - "github.com/bbalet/stopwords" gateway "github.com/cs3org/go-cs3apis/cs3/gateway/v1beta1" provider "github.com/cs3org/go-cs3apis/cs3/storage/provider/v1beta1" "github.com/cs3org/reva/v2/pkg/rgrpc/todo/pool" "github.com/google/go-tika/tika" + "github.com/owncloud/ocis/v2/ocis-pkg/log" "github.com/owncloud/ocis/v2/services/search/pkg/config" ) @@ -20,7 +20,8 @@ type Tika struct { *Basic Retriever tika *tika.Client - contentExtractionSizeLimit uint64 + ContentExtractionSizeLimit uint64 + CleanStopWords bool } // NewTikaExtractor creates a new Tika instance. 
@@ -41,7 +42,8 @@ func NewTikaExtractor(gatewaySelector pool.Selectable[gateway.GatewayAPIClient], Basic: basic, Retriever: newCS3Retriever(gatewaySelector, logger, cfg.Extractor.CS3AllowInsecure), tika: tika.NewClient(nil, cfg.Extractor.Tika.TikaURL), - contentExtractionSizeLimit: cfg.ContentExtractionSizeLimit, + ContentExtractionSizeLimit: cfg.ContentExtractionSizeLimit, + CleanStopWords: cfg.Extractor.Tika.CleanStopWords, }, nil } @@ -56,7 +58,7 @@ func (t Tika) Extract(ctx context.Context, ri *provider.ResourceInfo) (Document, return doc, nil } - if ri.Size > t.contentExtractionSizeLimit { + if ri.Size > t.ContentExtractionSizeLimit { t.logger.Info().Interface("ResourceID", ri.Id).Str("Name", ri.Name).Msg("file exceeds content extraction size limit. skipping.") return doc, nil } @@ -86,8 +88,8 @@ func (t Tika) Extract(ctx context.Context, ri *provider.ResourceInfo) (Document, } } - if lang, _ := t.tika.LanguageString(ctx, doc.Content); lang != "" { - doc.Content = stopwords.CleanString(doc.Content, lang, true) + if langCode, _ := t.tika.LanguageString(ctx, doc.Content); langCode != "" && t.CleanStopWords { + doc.Content = CleanString(doc.Content, langCode) } return doc, nil diff --git a/services/search/pkg/content/tika_test.go b/services/search/pkg/content/tika_test.go index fa26118241..fbb5c0578b 100644 --- a/services/search/pkg/content/tika_test.go +++ b/services/search/pkg/content/tika_test.go @@ -11,11 +11,12 @@ import ( provider "github.com/cs3org/go-cs3apis/cs3/storage/provider/v1beta1" . "github.com/onsi/ginkgo/v2" . 
"github.com/onsi/gomega" + "github.com/stretchr/testify/mock" + "github.com/owncloud/ocis/v2/ocis-pkg/log" conf "github.com/owncloud/ocis/v2/services/search/pkg/config/defaults" "github.com/owncloud/ocis/v2/services/search/pkg/content" contentMocks "github.com/owncloud/ocis/v2/services/search/pkg/content/mocks" - "github.com/stretchr/testify/mock" ) var _ = Describe("Tika", func() { @@ -48,6 +49,7 @@ var _ = Describe("Tika", func() { cfg := conf.DefaultConfig() cfg.Extractor.Tika.TikaURL = srv.URL + cfg.Extractor.Tika.CleanStopWords = true var err error tika, err = content.NewTikaExtractor(nil, log.NewLogger(), cfg) @@ -82,7 +84,7 @@ var _ = Describe("Tika", func() { }) It("removes stop words", func() { - body = "body to test stop words!!! I, you, he, she, it, we, you, they, stay" + body = "body to test stop words!!! against almost everyone" language = "en" doc, err := tika.Extract(context.TODO(), &provider.ResourceInfo{ @@ -90,7 +92,20 @@ var _ = Describe("Tika", func() { Size: 1, }) Expect(err).ToNot(HaveOccurred()) - Expect(doc.Content).To(Equal("body test stop words i stay ")) + Expect(doc.Content).To(Equal("body test stop words!!!")) + }) + + It("keeps stop words", func() { + body = "body to test stop words!!! against almost everyone" + language = "en" + + tika.CleanStopWords = false + doc, err := tika.Extract(context.TODO(), &provider.ResourceInfo{ + Type: provider.ResourceType_RESOURCE_TYPE_FILE, + Size: 1, + }) + Expect(err).ToNot(HaveOccurred()) + Expect(doc.Content).To(Equal(body)) }) }) })