enhancement: improve content extraction stop word cleaning (#7553)

* enhancement: improve content extraction stop word cleaning

* fix: cleanup documentation

Co-authored-by: Martin <github@diemattels.at>

* fix: failing tika stop word unit tests

---------

Co-authored-by: Martin <github@diemattels.at>
This commit is contained in:
Florian Schade
2023-10-23 13:40:37 +02:00
committed by GitHub
parent e24fce483a
commit cdd2100b4b
8 changed files with 98 additions and 11 deletions

View File

@@ -0,0 +1,15 @@
Enhancement: Tika content extraction cleanup for search
So far it has not been possible to configure whether
'stop words' are removed from the content indexed for search.
Stop words are filler words such as "I", "you", "have" or "am";
the list of stop words is defined by the search engine.
The behaviour can now be controlled with the newly introduced setting `SEARCH_EXTRACTOR_TIKA_CLEAN_STOP_WORDS`.
Stop word cleanup is enabled by default; set the variable to `false` to keep stop words.
In addition, the stop word cleanup is no longer as aggressive: it now leaves numbers, URLs and
basically everything except the defined stop words untouched.
https://github.com/owncloud/ocis/pull/7553
https://github.com/owncloud/ocis/issues/6674

View File

@@ -70,6 +70,9 @@ When the search service can reach Tika, it begins to read out the content on dem
Content extraction and handling the extracted content can be very resource intensive. Content extraction is therefore limited to files with a certain file size. The default limit is 20MB and can be configured using the `SEARCH_CONTENT_EXTRACTION_SIZE_LIMIT` variable.
When extracting content, you can specify whether stop words (filler words) are removed or kept.
They are removed by default; to keep them, set the environment variable `SEARCH_EXTRACTOR_TIKA_CLEAN_STOP_WORDS` to `false`.
When using the Tika container and docker-compose, consider the following:
* See the [ocis_wopi](https://github.com/owncloud/ocis/tree/master/deployments/examples/ocis_wopi) example.

View File

@@ -9,5 +9,6 @@ type Extractor struct {
// ExtractorTika configures the Tika extractor.
// Each field can be set through the yaml key or the environment variable
// named in its struct tag.
type ExtractorTika struct {
// TikaURL is the URL of the Tika server.
TikaURL string `yaml:"tika_url" env:"SEARCH_EXTRACTOR_TIKA_TIKA_URL" desc:"URL of the tika server."`
TikaURL string `yaml:"tika_url" env:"SEARCH_EXTRACTOR_TIKA_TIKA_URL" desc:"URL of the tika server."`
// CleanStopWords defines whether stop words are cleaned from the extracted content.
CleanStopWords bool `yaml:"clean_stop_words" env:"SEARCH_EXTRACTOR_TIKA_CLEAN_STOP_WORDS" desc:"Defines if stop words should be cleaned or not. See the documentation for more details."`
}

View File

@@ -43,7 +43,8 @@ func DefaultConfig() *config.Config {
Type: "basic",
CS3AllowInsecure: false,
Tika: config.ExtractorTika{
TikaURL: "http://127.0.0.1:9998",
TikaURL: "http://127.0.0.1:9998",
CleanStopWords: true,
},
},
Events: config.Events{

View File

@@ -1,5 +1,15 @@
package content
import (
"strings"
"github.com/bbalet/stopwords"
)
// init replaces the word segmenter of the stopwords package at package load
// time, so every later stop word cleanup uses the pattern below.
func init() {
	// A word is any run of non-space characters. Tokens such as URLs or
	// numbers therefore stay in one piece instead of being split apart.
	const segmenterPattern = `[^ ]+`

	stopwords.OverwriteWordSegmenter(segmenterPattern)
}
// Document wraps all resource meta fields,
// it is used as a content extraction result.
type Document struct {
@@ -11,3 +21,7 @@ type Document struct {
MimeType string
Tags []string
}
// CleanString removes the stop words of the language identified by langCode
// from content and returns the result without leading or trailing whitespace.
//
// The actual cleanup is delegated to stopwords.CleanString.
func CleanString(content, langCode string) string {
	// NOTE(review): the third argument is the library's cleanHTML switch;
	// confirm against the stopwords documentation.
	cleaned := stopwords.CleanString(content, langCode, true)

	return strings.TrimSpace(cleaned)
}

View File

@@ -0,0 +1,36 @@
package content_test
import (
"testing"
. "github.com/stretchr/testify/assert"
"github.com/owncloud/ocis/v2/services/search/pkg/content"
)
// TestCleanContent checks content.CleanString against English sample inputs:
// stop words are dropped, the remaining text is lower-cased, and tokens such
// as "user1" or a full URL are kept in one piece.
func TestCleanContent(t *testing.T) {
	type testCase struct {
		given  string
		expect string
	}

	cases := []testCase{
		{
			given:  "find can keeper should keeper will",
			expect: "keeper keeper",
		},
		{
			given:  "user1 shares the file to Marie",
			expect: "user1 shares file marie",
		},
		{
			given:  "content contains https://localhost/remote.php/dav/files/admin/Photos/San%20Francisco.jpg and stop word",
			expect: "content contains https://localhost/remote.php/dav/files/admin/photos/san%20francisco.jpg stop word",
		},
	}

	for i := range cases {
		// Copy the fields into per-iteration locals so the subtest closure
		// never shares state with a later iteration.
		input, want := cases[i].given, cases[i].expect

		// The input doubles as the subtest name.
		t.Run(input, func(t *testing.T) {
			Equal(t, want, content.CleanString(input, "en"))
		})
	}
}

View File

@@ -5,11 +5,11 @@ import (
"fmt"
"strings"
"github.com/bbalet/stopwords"
gateway "github.com/cs3org/go-cs3apis/cs3/gateway/v1beta1"
provider "github.com/cs3org/go-cs3apis/cs3/storage/provider/v1beta1"
"github.com/cs3org/reva/v2/pkg/rgrpc/todo/pool"
"github.com/google/go-tika/tika"
"github.com/owncloud/ocis/v2/ocis-pkg/log"
"github.com/owncloud/ocis/v2/services/search/pkg/config"
)
@@ -20,7 +20,8 @@ type Tika struct {
*Basic
Retriever
tika *tika.Client
contentExtractionSizeLimit uint64
ContentExtractionSizeLimit uint64
CleanStopWords bool
}
// NewTikaExtractor creates a new Tika instance.
@@ -41,7 +42,8 @@ func NewTikaExtractor(gatewaySelector pool.Selectable[gateway.GatewayAPIClient],
Basic: basic,
Retriever: newCS3Retriever(gatewaySelector, logger, cfg.Extractor.CS3AllowInsecure),
tika: tika.NewClient(nil, cfg.Extractor.Tika.TikaURL),
contentExtractionSizeLimit: cfg.ContentExtractionSizeLimit,
ContentExtractionSizeLimit: cfg.ContentExtractionSizeLimit,
CleanStopWords: cfg.Extractor.Tika.CleanStopWords,
}, nil
}
@@ -56,7 +58,7 @@ func (t Tika) Extract(ctx context.Context, ri *provider.ResourceInfo) (Document,
return doc, nil
}
if ri.Size > t.contentExtractionSizeLimit {
if ri.Size > t.ContentExtractionSizeLimit {
t.logger.Info().Interface("ResourceID", ri.Id).Str("Name", ri.Name).Msg("file exceeds content extraction size limit. skipping.")
return doc, nil
}
@@ -86,8 +88,8 @@ func (t Tika) Extract(ctx context.Context, ri *provider.ResourceInfo) (Document,
}
}
if lang, _ := t.tika.LanguageString(ctx, doc.Content); lang != "" {
doc.Content = stopwords.CleanString(doc.Content, lang, true)
if langCode, _ := t.tika.LanguageString(ctx, doc.Content); langCode != "" && t.CleanStopWords {
doc.Content = CleanString(doc.Content, langCode)
}
return doc, nil

View File

@@ -11,11 +11,12 @@ import (
provider "github.com/cs3org/go-cs3apis/cs3/storage/provider/v1beta1"
. "github.com/onsi/ginkgo/v2"
. "github.com/onsi/gomega"
"github.com/stretchr/testify/mock"
"github.com/owncloud/ocis/v2/ocis-pkg/log"
conf "github.com/owncloud/ocis/v2/services/search/pkg/config/defaults"
"github.com/owncloud/ocis/v2/services/search/pkg/content"
contentMocks "github.com/owncloud/ocis/v2/services/search/pkg/content/mocks"
"github.com/stretchr/testify/mock"
)
var _ = Describe("Tika", func() {
@@ -48,6 +49,7 @@ var _ = Describe("Tika", func() {
cfg := conf.DefaultConfig()
cfg.Extractor.Tika.TikaURL = srv.URL
cfg.Extractor.Tika.CleanStopWords = true
var err error
tika, err = content.NewTikaExtractor(nil, log.NewLogger(), cfg)
@@ -82,7 +84,7 @@ var _ = Describe("Tika", func() {
})
It("removes stop words", func() {
body = "body to test stop words!!! I, you, he, she, it, we, you, they, stay"
body = "body to test stop words!!! against almost everyone"
language = "en"
doc, err := tika.Extract(context.TODO(), &provider.ResourceInfo{
@@ -90,7 +92,20 @@ var _ = Describe("Tika", func() {
Size: 1,
})
Expect(err).ToNot(HaveOccurred())
Expect(doc.Content).To(Equal("body test stop words i stay "))
Expect(doc.Content).To(Equal("body test stop words!!!"))
})
It("keeps stop words", func() {
body = "body to test stop words!!! against almost everyone"
language = "en"
tika.CleanStopWords = false
doc, err := tika.Extract(context.TODO(), &provider.ResourceInfo{
Type: provider.ResourceType_RESOURCE_TYPE_FILE,
Size: 1,
})
Expect(err).ToNot(HaveOccurred())
Expect(doc.Content).To(Equal(body))
})
})
})