Code Monkey home page Code Monkey logo

gse-bleve's Introduction

gse-bleve

Build Status Build Status CircleCI Status codecov Go Report Card GoDoc Release

Use

package main

import (
	"fmt"
	"os"

	"github.com/blevesearch/bleve/v2"
	gse "github.com/vcaesar/gse-bleve"
)

// main builds a throwaway gse-backed bleve index at "test.blv", indexes a few
// Japanese and English documents, runs a highlighted query-string search and
// prints the result. The on-disk index is removed before the program exits.
func main() {
	opt := gse.Option{
		Index: "test.blv",
		Dicts: "embed, ja",
		// Dicts: "embed, zh",
		Stop: "",
		Opt:  "search-hmm",
		Trim: "trim",
	}

	index, err := gse.New(opt)
	if err != nil {
		fmt.Println("new mapping error is: ", err)
		return
	}
	// Clean up the index directory on every exit path once it has been created.
	defer os.RemoveAll(opt.Index)

	text := `見解では、謙虚なヴォードヴィリアンのベテランは、運命の犠牲者と悪役の両方の変遷として代償を払っています`
	docs := []struct{ id, body string }{
		{"1", text},
		{"3", text + "浮き沈み"},
		{"4", `In view, a humble vaudevillian veteran cast vicariously as both victim and villain vicissitudes of fate.`},
		{"2", `It's difficult to understand the sum of a person's life.`},
	}
	// Check the error of every indexing call, not just the first one. A failed
	// document is reported and the remaining ones are still indexed.
	for _, d := range docs {
		if err := index.Index(d.id, d.body); err != nil {
			fmt.Println("index error: ", err)
		}
	}

	query := "運命の犠牲者"
	req := bleve.NewSearchRequest(bleve.NewQueryStringQuery(query))
	req.Highlight = bleve.NewHighlight()
	res, err := index.Search(req)
	if err != nil {
		// Do not print a result that is meaningless when the search failed.
		fmt.Println("search error: ", err)
		return
	}
	fmt.Println(res)
}

gse-bleve's People

Contributors

vcaesar avatar

Stargazers

 avatar  avatar  avatar  avatar  avatar  avatar  avatar  avatar  avatar  avatar  avatar  avatar  avatar  avatar  avatar  avatar  avatar  avatar  avatar  avatar  avatar  avatar  avatar  avatar  avatar

Watchers

 avatar  avatar  avatar

gse-bleve's Issues

How to build a binary without embedded dictionary included?

Sorry for the wrong proposal. The problem should only be related to gse-bleve.

  • Gse version (or commit ref): 0.70.1
  • Go version: 1.17
  • Operating system and bit: Ubuntu 20.04 64bit
  • Can you reproduce the bug at Examples:
    • Yes (provide example code)
    • No
    • Not relevant
  • Provide example code:
package main

import (
	"fmt"
	"os"

	"github.com/blevesearch/bleve/v2"
	gse "github.com/vcaesar/gse-bleve"
)

// main reproduces the binary-size report: it builds a throwaway gse-backed
// bleve index at "test.blv" using a custom dictionary file, indexes a few
// documents, runs a highlighted query-string search and prints the result.
// The on-disk index is removed before the program exits.
func main() {
	opt := gse.Option{
		Index: "test.blv",
		// Dicts: "embed, ja",
		// Dicts: "embed, zh",
		Dicts: "dict.txt",
		Stop:  "",
		Opt:   "search-hmm",
		Trim:  "trim",
	}

	index, err := gse.New(opt)
	if err != nil {
		fmt.Println("new mapping error is: ", err)
		return
	}
	// Clean up the index directory on every exit path once it has been created.
	defer os.RemoveAll(opt.Index)

	text := `見解では、謙虚なヴォードヴィリアンのベテランは、運命の犠牲者と悪役の両方の変遷として代償を払っています`
	docs := []struct{ id, body string }{
		{"1", text},
		{"3", text + "浮き沈み"},
		{"4", `In view, a humble vaudevillian veteran cast vicariously as both victim and villain vicissitudes of fate.`},
		{"2", `It's difficult to understand the sum of a person's life.`},
	}
	// Check the error of every indexing call, not just the first one. A failed
	// document is reported and the remaining ones are still indexed.
	for _, d := range docs {
		if err := index.Index(d.id, d.body); err != nil {
			fmt.Println("index error: ", err)
		}
	}

	query := "運命の犠牲者"
	req := bleve.NewSearchRequest(bleve.NewQueryStringQuery(query))
	req.Highlight = bleve.NewHighlight()
	res, err := index.Search(req)
	if err != nil {
		// Do not print a result that is meaningless when the search failed.
		fmt.Println("search error: ", err)
		return
	}
	fmt.Println(res)
}

I've tested these dictionary configurations, trying to find out the bug:

  • Using "embed, zh" as Dicts: a 43 MB binary is generated.
  • Using dict.txt (a custom dictionary of only 3 lines) as Dicts: a 43 MB binary is generated.
  • Replacing gse-bleve with the default bleve: an 11 MB binary is generated.

It seems that whatever the dictionary is, the embedded dictionaries are always included in the binary file. I tried to find the bug, but I failed.

Thanks for your help.

The search result lost fragments

I tested the new version and it's great, but I ran into a bit of trouble when I tried to import gse-bleve into my project: the search results are missing fragments, and I cannot figure out why. Here is my code:
step 1: I make the Index

// INDEX_DIR is the on-disk path of the bleve index.
const INDEX_DIR = "gse.bleve2"

var (
	// MyIndex is the package-wide index handle; InitIndex assigns it.
	MyIndex bleve.Index

	// opt holds the gse options that GetIndexMapping passes to gse.NewMapping.
	opt = gse.Option{
		Index: INDEX_DIR,
		Dicts: "embed, zh",
		Stop:  "",
		Opt:   "search-hmm",
		Trim:  "trim",
	}
)

// InitIndex opens the index stored at INDEX_DIR and stores it in MyIndex.
// When the path does not exist yet, a new index is created there using the
// mapping from GetIndexMapping. It reports whether MyIndex was set; any other
// open error, a nil mapping, or a creation failure yields false.
func InitIndex() bool {
	opened, openErr := bleve.Open(INDEX_DIR)
	if openErr == nil {
		MyIndex = opened
		return true
	}

	// Only a missing index path is recoverable; every other failure is final.
	if openErr != bleve.ErrorIndexPathDoesNotExist {
		return false
	}

	idxMapping := GetIndexMapping()
	if idxMapping == nil {
		return false
	}

	created, createErr := bleve.New(INDEX_DIR, idxMapping)
	if createErr != nil {
		return false
	}
	MyIndex = created
	return true
}

// GetIndexMapping builds the index mapping for "myfile" documents: string
// fields use the gse text mapping and numeric fields use a numeric mapping.
// It returns nil (after printing the error) when gse.NewMapping fails.
func GetIndexMapping() *mapping.IndexMappingImpl {
	// Per-field mappings for the document struct.
	docMapping := mapping.NewDocumentMapping()

	// A generic reusable mapping for keyword text (fields that need no
	// segmentation). It is configured here but not attached to any field below.
	kwMapping := bleve.NewTextFieldMapping()
	kwMapping.Analyzer = keyword.Name
	kwMapping.Name = keyword.Name // Name apparently has to be set? English search behaves that way.

	// Segmented text mapping from gse (instead of mapping.NewTextFieldMapping()).
	textMapping := gse.NewTextMap()

	// A generic reusable mapping for numeric values.
	numMapping := bleve.NewNumericFieldMapping()

	// A datetime mapping (bleve.NewDateTimeFieldMapping()) is not used for now:
	// position_id is currently not indexed, and date fields are stored as
	// int64 values instead.

	textFields := []string{
		// Segmented fields.
		"name", "description", "summary", "content",
		// Keyword fields; dates are also kept as strings so they come back
		// in the query results.
		"author", "specialty", "classify",
		"id", "ext", "input_user_id",
		"date_send_str", "date_effect_str", "date_cancel_str", "input_time_str",
	}
	for _, field := range textFields {
		docMapping.AddFieldMappingsAt(field, textMapping)
	}

	// Numeric fields.
	numericFields := []string{
		"state", "date_send", "date_effect", "date_cancel", "input_time",
	}
	for _, field := range numericFields {
		docMapping.AddFieldMappingsAt(field, numMapping)
	}

	indexMapping, err := gse.NewMapping(opt)
	if err != nil {
		fmt.Println("初始化出错:", err)
		return nil
	}
	indexMapping.AddDocumentMapping("myfile", docMapping)
	return indexMapping
}

step 2: Index some text

step 3: do search

// SearchContent carries the parameters of a search: the text to look for and
// the page of results to return.
type SearchContent struct {
	// Content is the search text; the caller interpolates it into the query string.
	Content string `json:"content"`
	// Page is the page number handed to DoSearch as pageNo.
	Page    int    `json:"page"`
}

// DoSearch runs q against MyIndex with highlighting enabled and returns one
// page of results.
//
// pageNo is 1-based; values below 1 are treated as the first page so that the
// computed offset can never be negative. The page size comes from
// MyConfig.PageSize. DoSearch panics if the search itself fails.
func DoSearch(q query.Query, pageNo int) *bleve.SearchResult {
	// A zero-valued or negative page number would otherwise yield a negative
	// req.From.
	if pageNo < 1 {
		pageNo = 1
	}

	req := bleve.NewSearchRequest(q)
	req.Highlight = bleve.NewHighlight()
	// Stored fields to load for each hit; []string{"*"} would return all fields.
	req.Fields = []string{"id", "name", "summary", "description", "ext", "author",
		"specialty", "classify", "date_send_str", "date_effect_str",
		"date_cancel_str", "input_time_str", "input_user_id", "state"}
	req.Size = MyConfig.PageSize                // number of results returned per request
	req.From = MyConfig.PageSize * (pageNo - 1) // offset of the first result on this page

	res, err := MyIndex.Search(req)
	if err != nil {
		panic(err)
	}

	return res
}

..............................
// swords holds the search parameters. As written here it is zero-valued
// (empty Content, Page 0); presumably the elided code fills it in — TODO confirm.
swords := &SearchContent{}
// Look for the same text in both the "content" and the "name" field.
q := fmt.Sprintf("content:%s name:%s", swords.Content,swords.Content)
myquery := bleve.NewQueryStringQuery(q)
// Run the query for the requested page.
res := DoSearch(myquery, swords.Page)

..............................
Is there anything I have missed?

Chinese mess code and mark offset

Thank u very much!
I ran go get -u on the packages and found that the update only fixed the <mark> tags in the first paragraph. If the text has several paragraphs, the <mark> tags are sometimes offset, and garbled characters still appear from the second paragraph onward, as in this example:

// text1 is the multi-sentence Chinese sample text used to reproduce the
// highlighting problem.
var text1 = `马克思主义认为,管理具有两重性,即既有同生产力相联系的自然属性,又有同生产力相互制约的社会属性。后勤管理是与科学技术的进步、生产力的发展水平紧密联系在一起的。生产力和科学技术水平直接决定着后勤工作中财和物的管理水平以及人员素质,这是后勤管理自然属性的表现。另一方面,后勤管理又是占有生产资料的阶级用来调整阶级关系,维护本阶级利益的一种手段,具有与生产关系相联系的性质,在阶级社会中具有鲜明的阶级性。社会主义制度下的后勤管理不再体现为剥削与被剥削的关系,而体现人与人之间平等互助的客观要求,这是后勤管理的社会属性。`

Bugs in the result:
1 - the first <mark> is at the wrong position; it is offset
2 - garbled characters appear from the second paragraph onward.

Result of: '管理': 1 matches

  1. 1, (0.063785)
    content: …管理具有两重性,<mark>即既</mark>有同生产力相联系的自然属性,又有同生产力相互制约的社会属性。
    后勤管理是与科学技术的进�<mark>�、�</mark>�产力的发展水平紧密联系在一起的。生产力和科学技术水平直接决定着后勤工作中财和物的管理水平
    以及人员素质,这是后勤管理自然属性的表现。另一方面,�<mark>��勤�</mark>��理又是占有生产资料的阶级用来调整阶级关�<mark>��,�</mark>��护本阶级利益的一
    种手段,具有与生产关系相联系<mark>的性</mark>质,在阶级社会中具有鲜明的阶级性。社会主义制度…

大佬,请教问题

现在有个问题,一个mapping里,有可能有中文文本,也有可能有英文,也有可能中英文都有,现在需求是,就像MySQL 的like查找一样,筛选所有包含的,找了好多分词工具,都没达到想要的效果,您这块有啥建议吗?或者说有没有推荐的解决方案

Sent from PPHub

Update bleve to newest

can't compile

/go/pkg/mod/github.com/vcaesar/gse-bleve@<version>/analyzer.go:25:9: invalid composite literal type analysis.Analyzer
/go/pkg/mod/github.com/vcaesar/gse-bleve@<version>/analyzer.go:30:39: cannot use analyzerConstructor (value of type func(config map[string]interface{}, cache *registry.Cache) (*analysis.Analyzer, error)) as type registry.AnalyzerConstructor in argument to registry.RegisterAnalyzer
/go/pkg/mod/github.com/vcaesar/gse-bleve@<version>/analyzer.go:31:42: cannot use analyzerConstructor (value of type func(config map[string]interface{}, cache *registry.Cache) (*analysis.Analyzer, error)) as type registry.AnalyzerConstructor in argument to registry.RegisterAnalyzer

Compilation finished with exit code 2

Chinese mess code

I uploaded some text files (UTF-8, .txt) containing long Chinese text, read them, and built indexes. github.com/leopku/bleve-gse-tokenizer/v2 works fine, but this package returns garbled text; some results are only partially garbled.

Recommend Projects

  • React photo React

    A declarative, efficient, and flexible JavaScript library for building user interfaces.

  • Vue.js photo Vue.js

    🖖 Vue.js is a progressive, incrementally-adoptable JavaScript framework for building UI on the web.

  • Typescript photo Typescript

    TypeScript is a superset of JavaScript that compiles to clean JavaScript output.

  • TensorFlow photo TensorFlow

    An Open Source Machine Learning Framework for Everyone

  • Django photo Django

    The Web framework for perfectionists with deadlines.

  • D3 photo D3

    Bring data to life with SVG, Canvas and HTML. 📊📈🎉

Recommend Topics

  • javascript

    JavaScript (JS) is a lightweight interpreted programming language with first-class functions.

  • web

    Some thing interesting about web. New door for the world.

  • server

    A server is a program made to process requests and deliver data to clients.

  • Machine learning

    Machine learning is a way of modeling and interpreting data that allows a piece of software to respond intelligently.

  • Game

    Some thing interesting about game, make everyone happy.

Recommend Org

  • Facebook photo Facebook

    We are working to build community through open source technology. NB: members must have two-factor auth.

  • Microsoft photo Microsoft

    Open source projects and samples from Microsoft.

  • Google photo Google

    Google ❤️ Open Source for everyone.

  • D3 photo D3

    Data-Driven Documents codes.