mirror of
https://github.com/opencloud-eu/opencloud.git
synced 2026-05-04 14:13:18 -04:00
enhancement: improve content extraction stop word cleaning (#7553)
* enhancement: improve content extraction stop word cleaning * fix: cleanup documentation Co-authored-by: Martin <github@diemattels.at> * fix: failing tika stop word unit tests --------- Co-authored-by: Martin <github@diemattels.at>
This commit is contained in:
@@ -0,0 +1,15 @@
|
||||
Enhancement: Tika content extraction cleanup for search
|
||||
|
||||
So far it has not been possible to configure whether the
|
||||
content for search should be cleaned of 'stop words' or not.
|
||||
Stop words are filler words like "I, you, have, am" etc. and are
|
||||
defined by the search engine.
|
||||
|
||||
The behaviour can now be controlled with the newly introduced setting `SEARCH_EXTRACTOR_TIKA_CLEAN_STOP_WORDS`,
|
||||
which is enabled by default. Set it to `false` to keep stop words.
|
||||
|
||||
In addition, the stop word cleanup is no longer as aggressive and now keeps numbers, URLs,
|
||||
and basically everything except the defined stop words.
|
||||
|
||||
https://github.com/owncloud/ocis/pull/7553
|
||||
https://github.com/owncloud/ocis/issues/6674
|
||||
@@ -70,6 +70,9 @@ When the search service can reach Tika, it begins to read out the content on dem
|
||||
|
||||
Content extraction and handling the extracted content can be very resource intensive. Content extraction is therefore limited to files with a certain file size. The default limit is 20MB and can be configured using the `SEARCH_CONTENT_EXTRACTION_SIZE_LIMIT` variable.
|
||||
|
||||
When extracting the content, you can specify whether filler words (stop words) are removed or not.
|
||||
To keep them, the environment variable `SEARCH_EXTRACTOR_TIKA_CLEAN_STOP_WORDS` must be set to `false`.
|
||||
|
||||
When using the Tika container and docker-compose, consider the following:
|
||||
|
||||
* See the [ocis_wopi](https://github.com/owncloud/ocis/tree/master/deployments/examples/ocis_wopi) example.
|
||||
|
||||
@@ -9,5 +9,6 @@ type Extractor struct {
|
||||
|
||||
// ExtractorTika configures the Tika extractor
|
||||
type ExtractorTika struct {
|
||||
// TikaURL is the URL of the Tika server used for content extraction.
TikaURL string `yaml:"tika_url" env:"SEARCH_EXTRACTOR_TIKA_TIKA_URL" desc:"URL of the tika server."`
|
||||
TikaURL string `yaml:"tika_url" env:"SEARCH_EXTRACTOR_TIKA_TIKA_URL" desc:"URL of the tika server."`
|
||||
// CleanStopWords controls whether stop words are removed from the
// extracted content. The default configuration sets it to true.
CleanStopWords bool `yaml:"clean_stop_words" env:"SEARCH_EXTRACTOR_TIKA_CLEAN_STOP_WORDS" desc:"Defines if stop words should be cleaned or not. See the documentation for more details."`
|
||||
}
|
||||
|
||||
@@ -43,7 +43,8 @@ func DefaultConfig() *config.Config {
|
||||
Type: "basic",
|
||||
CS3AllowInsecure: false,
|
||||
Tika: config.ExtractorTika{
|
||||
TikaURL: "http://127.0.0.1:9998",
|
||||
TikaURL: "http://127.0.0.1:9998",
|
||||
CleanStopWords: true,
|
||||
},
|
||||
},
|
||||
Events: config.Events{
|
||||
|
||||
@@ -1,5 +1,15 @@
|
||||
package content
|
||||
|
||||
import (
|
||||
"strings"
|
||||
|
||||
"github.com/bbalet/stopwords"
|
||||
)
|
||||
|
||||
// init overrides the word segmenter of the stopwords package with the
// pattern `[^ ]+`, which matches every run of non-space characters.
// NOTE(review): presumably this makes the library treat whole
// space-separated tokens (URLs, numbers, "words!!!") as single words, so
// only tokens that are exactly a stop word get removed - confirm against
// the bbalet/stopwords documentation.
func init() {
|
||||
stopwords.OverwriteWordSegmenter(`[^ ]+`)
|
||||
}
|
||||
|
||||
// Document wraps all resource meta fields,
|
||||
// it is used as a content extraction result.
|
||||
type Document struct {
|
||||
@@ -11,3 +21,7 @@ type Document struct {
|
||||
MimeType string
|
||||
Tags []string
|
||||
}
|
||||
|
||||
func CleanString(content, langCode string) string {
|
||||
return strings.TrimSpace(stopwords.CleanString(content, langCode, true))
|
||||
}
|
||||
|
||||
36
services/search/pkg/content/content_test.go
Normal file
36
services/search/pkg/content/content_test.go
Normal file
@@ -0,0 +1,36 @@
|
||||
package content_test
|
||||
|
||||
import (
|
||||
"testing"
|
||||
|
||||
. "github.com/stretchr/testify/assert"
|
||||
|
||||
"github.com/owncloud/ocis/v2/services/search/pkg/content"
|
||||
)
|
||||
|
||||
func TestCleanContent(t *testing.T) {
|
||||
tests := []struct {
|
||||
given string
|
||||
expect string
|
||||
}{
|
||||
{
|
||||
given: "find can keeper should keeper will",
|
||||
expect: "keeper keeper",
|
||||
},
|
||||
{
|
||||
given: "user1 shares the file to Marie",
|
||||
expect: "user1 shares file marie",
|
||||
},
|
||||
{
|
||||
given: "content contains https://localhost/remote.php/dav/files/admin/Photos/San%20Francisco.jpg and stop word",
|
||||
expect: "content contains https://localhost/remote.php/dav/files/admin/photos/san%20francisco.jpg stop word",
|
||||
},
|
||||
}
|
||||
|
||||
for _, tc := range tests {
|
||||
tc := tc
|
||||
t.Run(tc.given, func(t *testing.T) {
|
||||
Equal(t, tc.expect, content.CleanString(tc.given, "en"))
|
||||
})
|
||||
}
|
||||
}
|
||||
@@ -5,11 +5,11 @@ import (
|
||||
"fmt"
|
||||
"strings"
|
||||
|
||||
"github.com/bbalet/stopwords"
|
||||
gateway "github.com/cs3org/go-cs3apis/cs3/gateway/v1beta1"
|
||||
provider "github.com/cs3org/go-cs3apis/cs3/storage/provider/v1beta1"
|
||||
"github.com/cs3org/reva/v2/pkg/rgrpc/todo/pool"
|
||||
"github.com/google/go-tika/tika"
|
||||
|
||||
"github.com/owncloud/ocis/v2/ocis-pkg/log"
|
||||
"github.com/owncloud/ocis/v2/services/search/pkg/config"
|
||||
)
|
||||
@@ -20,7 +20,8 @@ type Tika struct {
|
||||
*Basic
|
||||
Retriever
|
||||
tika *tika.Client
|
||||
contentExtractionSizeLimit uint64
|
||||
ContentExtractionSizeLimit uint64
|
||||
CleanStopWords bool
|
||||
}
|
||||
|
||||
// NewTikaExtractor creates a new Tika instance.
|
||||
@@ -41,7 +42,8 @@ func NewTikaExtractor(gatewaySelector pool.Selectable[gateway.GatewayAPIClient],
|
||||
Basic: basic,
|
||||
Retriever: newCS3Retriever(gatewaySelector, logger, cfg.Extractor.CS3AllowInsecure),
|
||||
tika: tika.NewClient(nil, cfg.Extractor.Tika.TikaURL),
|
||||
contentExtractionSizeLimit: cfg.ContentExtractionSizeLimit,
|
||||
ContentExtractionSizeLimit: cfg.ContentExtractionSizeLimit,
|
||||
CleanStopWords: cfg.Extractor.Tika.CleanStopWords,
|
||||
}, nil
|
||||
}
|
||||
|
||||
@@ -56,7 +58,7 @@ func (t Tika) Extract(ctx context.Context, ri *provider.ResourceInfo) (Document,
|
||||
return doc, nil
|
||||
}
|
||||
|
||||
if ri.Size > t.contentExtractionSizeLimit {
|
||||
if ri.Size > t.ContentExtractionSizeLimit {
|
||||
t.logger.Info().Interface("ResourceID", ri.Id).Str("Name", ri.Name).Msg("file exceeds content extraction size limit. skipping.")
|
||||
return doc, nil
|
||||
}
|
||||
@@ -86,8 +88,8 @@ func (t Tika) Extract(ctx context.Context, ri *provider.ResourceInfo) (Document,
|
||||
}
|
||||
}
|
||||
|
||||
if lang, _ := t.tika.LanguageString(ctx, doc.Content); lang != "" {
|
||||
doc.Content = stopwords.CleanString(doc.Content, lang, true)
|
||||
if langCode, _ := t.tika.LanguageString(ctx, doc.Content); langCode != "" && t.CleanStopWords {
|
||||
doc.Content = CleanString(doc.Content, langCode)
|
||||
}
|
||||
|
||||
return doc, nil
|
||||
|
||||
@@ -11,11 +11,12 @@ import (
|
||||
provider "github.com/cs3org/go-cs3apis/cs3/storage/provider/v1beta1"
|
||||
. "github.com/onsi/ginkgo/v2"
|
||||
. "github.com/onsi/gomega"
|
||||
"github.com/stretchr/testify/mock"
|
||||
|
||||
"github.com/owncloud/ocis/v2/ocis-pkg/log"
|
||||
conf "github.com/owncloud/ocis/v2/services/search/pkg/config/defaults"
|
||||
"github.com/owncloud/ocis/v2/services/search/pkg/content"
|
||||
contentMocks "github.com/owncloud/ocis/v2/services/search/pkg/content/mocks"
|
||||
"github.com/stretchr/testify/mock"
|
||||
)
|
||||
|
||||
var _ = Describe("Tika", func() {
|
||||
@@ -48,6 +49,7 @@ var _ = Describe("Tika", func() {
|
||||
|
||||
cfg := conf.DefaultConfig()
|
||||
cfg.Extractor.Tika.TikaURL = srv.URL
|
||||
cfg.Extractor.Tika.CleanStopWords = true
|
||||
|
||||
var err error
|
||||
tika, err = content.NewTikaExtractor(nil, log.NewLogger(), cfg)
|
||||
@@ -82,7 +84,7 @@ var _ = Describe("Tika", func() {
|
||||
})
|
||||
|
||||
It("removes stop words", func() {
|
||||
body = "body to test stop words!!! I, you, he, she, it, we, you, they, stay"
|
||||
body = "body to test stop words!!! against almost everyone"
|
||||
language = "en"
|
||||
|
||||
doc, err := tika.Extract(context.TODO(), &provider.ResourceInfo{
|
||||
@@ -90,7 +92,20 @@ var _ = Describe("Tika", func() {
|
||||
Size: 1,
|
||||
})
|
||||
Expect(err).ToNot(HaveOccurred())
|
||||
Expect(doc.Content).To(Equal("body test stop words i stay "))
|
||||
Expect(doc.Content).To(Equal("body test stop words!!!"))
|
||||
})
|
||||
|
||||
It("keeps stop words", func() {
|
||||
body = "body to test stop words!!! against almost everyone"
|
||||
language = "en"
|
||||
|
||||
tika.CleanStopWords = false
|
||||
doc, err := tika.Extract(context.TODO(), &provider.ResourceInfo{
|
||||
Type: provider.ResourceType_RESOURCE_TYPE_FILE,
|
||||
Size: 1,
|
||||
})
|
||||
Expect(err).ToNot(HaveOccurred())
|
||||
Expect(doc.Content).To(Equal(body))
|
||||
})
|
||||
})
|
||||
})
|
||||
|
||||
Reference in New Issue
Block a user