ckiplab / ckiptagger
CKIP Neural Chinese Word Segmentation, POS Tagging, and NER
License: GNU General Public License v3.0
NER cannot use the WS coerce_dictionary for segmentation. Is there a fix or workaround?
Nb | proper noun
Nc | place noun
input (from README example)
sentence_list = ['瑞士 LAURASTAR S4a 熨燙護理系統', ...]
word_to_weight = {
    "瑞士 LAURASTAR": 1,
}
dictionary1 = construct_dictionary(word_to_weight)
# ws
word_sentence_list = ws(
    sentence_list,
    coerce_dictionary = dictionary1,
)
# pos
pos_sentence_list = pos(word_sentence_list)
# ner
entity_sentence_list = ner(word_sentence_list, pos_sentence_list)
# Print result
print(word_sentence_list[1], pos_sentence_list[1])
for i, sentence in enumerate(sentence_list):
    print()
    print(f"'{sentence}'")
    print_word_pos_sentence(word_sentence_list[i], pos_sentence_list[i])
    for entity in sorted(entity_sentence_list[i]):
        print(entity)
output
# without coerce_dictionary parameter
'瑞士 LAURASTAR S4a 熨燙護理系統'
['瑞士(Nc)', ' LAURASTAR S4(FW)', 'a (FW)', '熨燙(VC)', '護理(Na)', '系統(Na)']
(0, 2, 'PERSON', '瑞士')
# with coerce_dictionary parameter
'瑞士 LAURASTAR S4a 熨燙護理系統'
瑞士 LAURASTAR(Nb) S4(FW) a (FW) 熨燙(VC) 護理(Na) 系統(Na)
(0, 2, 'PERSON', '瑞士')
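The behavior above is consistent with NER running its own model rather than consuming the coerce_dictionary. Until there is an official fix, one possible workaround is to post-filter NER entities whose span falls inside a coerced dictionary word. This is purely a sketch; `filter_conflicting_entities` is a hypothetical helper, not part of ckiptagger, operating on the `(start, end, label, text)` entity tuples shown in the output above:

```python
def filter_conflicting_entities(entity_sentence, coerced_words, sentence):
    """Drop NER entities whose span lies strictly inside a coerced
    dictionary word, since WS already treats that word as one token."""
    # Character spans covered by coerced dictionary words in this sentence.
    spans = []
    for word in coerced_words:
        pos = sentence.find(word)
        while pos != -1:
            spans.append((pos, pos + len(word)))
            pos = sentence.find(word, pos + 1)
    kept = []
    for start, end, label, text in entity_sentence:
        inside = any(s <= start and end <= e and (end - start) < (e - s)
                     for s, e in spans)
        if not inside:
            kept.append((start, end, label, text))
    return kept
```

For the sentence above, filtering against the coerced word "瑞士 LAURASTAR" would drop the spurious (0, 2, 'PERSON', '瑞士') entity.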
How can this be solved? I used the standard installation from the README: pip install -U ckiptagger[tf,gdown]
Code
from ckiptagger import data_utils, construct_dictionary, WS, POS, NER
data_utils.download_data_gdown("./")
ws = WS("./data")
pos = POS("./data")
ner = NER("./data")
sentence_list = [
    "傅達仁今將執行安樂死,卻突然爆出自己20年前遭緯來體育台封殺,他不懂自己哪裡得罪到電視台。",
    "美國參議院針對今天總統布什所提名的勞工部長趙小蘭展開認可聽證會,預料她將會很順利通過參議院支持,成為該國有史以來第一位的華裔女性內閣成員。",
    "",
    "土地公有政策??還是土地婆有政策。.",
    "… 你確定嗎… 不要再騙了……",
    "最多容納59,000個人,或5.9萬人,再多就不行了.這是環評的結論.",
    "科長說:1,坪數對人數為1:3。2,可以再增加。",
]
word_s = ws(sentence_list,
            sentence_segmentation=True,
            segment_delimiter_set={'?', '?', '!', '!', '。', ',',
                                   ',', ';', ':', '、'})
word_p = pos(word_s)
print(word_p)
Getting this error:
Traceback (most recent call last):
  File "c:\Users\Administrator\Downloads...\python_file.py", line 4, in <module>
    pos = POS("./data")
          ^^^^^^^^^^^^^
  File "C:\ProgramData\anaconda3\envs\myenv\Lib\site-packages\ckiptagger\api.py", line 160, in __init__
    config.w_token_to_vector, config.w_embedding_d = _load_embedding(os.path.join(data_dir, "embedding_word"))
                                                     ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\ProgramData\anaconda3\envs\myenv\Lib\site-packages\ckiptagger\api.py", line 357, in _load_embedding
    vector_list = np.load(vector_file)
                  ^^^^^^^^^^^^^^^^^^^^
  File "C:\ProgramData\anaconda3\envs\myenv\Lib\site-packages\numpy\lib\npyio.py", line 432, in load
    return format.read_array(fid, allow_pickle=allow_pickle,
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\ProgramData\anaconda3\envs\myenv\Lib\site-packages\numpy\lib\format.py", line 790, in read_array
    array = numpy.fromfile(fp, dtype=dtype, count=count)
            ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
numpy.core._exceptions._ArrayMemoryError: Unable to allocate 1.52 GiB for an array with shape (406737300,) and data type float32
Exception ignored in: <function POS.__del__ at 0x0000020709DE6340>
Traceback (most recent call last):
  File "C:\ProgramData\anaconda3\envs\myenv\Lib\site-packages\ckiptagger\api.py", line 185, in __del__
    self.model.sess.close()
    ^^^^^^^^^^
AttributeError: 'POS' object has no attribute 'model'
Traceback (most recent call last):
  File "test.py", line 6, in <module>
    ner = NER("./data")
  File "/home/xijiz/.local/lib/python3.6/site-packages/ckiptagger/api.py", line 241, in __init__
    config.w_token_to_vector, config.w_embedding_d = _load_embedding(os.path.join(data_dir, "embedding_word"))
  File "/home/xijiz/.local/lib/python3.6/site-packages/ckiptagger/api.py", line 328, in _load_embedding
    vector_list = np.load(vector_file)
  File "/home/xijiz/.local/lib/python3.6/site-packages/numpy/lib/npyio.py", line 453, in load
    pickle_kwargs=pickle_kwargs)
  File "/home/xijiz/.local/lib/python3.6/site-packages/numpy/lib/format.py", line 738, in read_array
    array = numpy.fromfile(fp, dtype=dtype, count=count)
numpy.core._exceptions.MemoryError: Unable to allocate array with shape (406737300,) and data type float32
Exception ignored in: <bound method NER.__del__ of <ckiptagger.api.NER object at 0x7fb03ba2f358>>
Traceback (most recent call last):
  File "/home/xijiz/.local/lib/python3.6/site-packages/ckiptagger/api.py", line 258, in __del__
    self.model.sess.close()
Hello: I have recently been trying this package, and during text analysis the following errors keep appearing:
AttributeError: 'WS' object has no attribute 'model'
and
AttributeError: module 'tensorflow' has no attribute 'variable_scope'
among other errors.
After investigating, I found that my tensorflow version is 2.3.0, which does not match the requirement in the README (it seems every 2.x version has this problem; see #17 (comment)), so I decided to downgrade. During that process I found the lowest version I could reach was 2.2.0, because Python 3.8 does not support tensorflow releases before 2.2 (ref). Downgrading may solve the problem, but I hope support for newer tensorflow versions can be added in the future (or that Python 3.8 could use the older tensorflow).
Hello, thank you very much for open-sourcing such a great project.
Will this project support entity-relation extraction in the future?
The source code from PyPI doesn't work on Windows.
I can't build from source on Windows,
but it works fine on Linux and macOS.
FAQ from CC website:
Unlike software-specific licenses, CC licenses do not contain specific terms about the distribution of source code, which is often important to ensuring the free reuse and modifiability of software. Many software licenses also address patent rights, which are important to software but may not be applicable to other copyrightable works. Additionally, our licenses are currently not compatible with the major software licenses, so it would be difficult to integrate CC-licensed work with other free software. Existing software licenses were designed specifically for use with software and offer a similar set of rights to the Creative Commons licenses.
You should consider another license designed for software.
We want to train models for Hakka and Taioanoe. Will CKIP release the training code?
Or do you have any suggestions?
When using CKIPTagger I see warning messages in some environments. I'm using Anaconda / Jupyter Notebook / Python 3.6.9 / Tensorflow 1.14.0. The program still produces results, but I'm not sure whether this is critical.
WARNING: Entity <bound method LSTMCell.call of <tensorflow.python.ops.rnn_cell_impl.LSTMCell object at 0x000001E39465D668>> could not be transformed and will be executed as-is. Please report this to the AutoGraph team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output. Cause: converting <bound method LSTMCell.call of <tensorflow.python.ops.rnn_cell_impl.LSTMCell object at 0x000001E39465D668>>: AttributeError: module 'gast' has no attribute 'Num'
WARNING: Entity <bound method Dense.call of <tensorflow.python.layers.core.Dense object at 0x000001E389F09C18>> could not be transformed and will be executed as-is. Please report this to the AutoGraph team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output. Cause: converting <bound method Dense.call of <tensorflow.python.layers.core.Dense object at 0x000001E389F09C18>>: AssertionError: Bad argument number for Name: 3, expecting 4
(the same LSTMCell and Dense warnings repeat for several more layer instances)
I installed with
pip install -U ckiptagger
"""
from ckiptagger import data_utils, construct_dictionary, WS
data_utils.download_data_gdown("./")
ws = WS("./data")
ckip_tok = ws("老師你好")
"""
The result is:
[['老'], ['師'], ['你'], ['好']]
How can I fix this?
Python version: 3.8.10
ckiptagger version: 0.2.1
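The character-by-character result above is what one would expect if a bare string were passed to ws: WS maps over its input as a list of sentences, and iterating a Python string yields individual characters. This pure-Python sketch (no model needed, purely illustrative) shows the pitfall; the likely fix is ws(["老師你好"]) rather than ws("老師你好"):

```python
# Iterating a string yields characters, so an API that maps over its
# input treats "老師你好" as four one-character "sentences".
text = "老師你好"
as_string = [[ch] for ch in text]  # shape of what ws("老師你好") effectively sees
as_list = [text]                   # what ws(["老師你好"]) receives

print(as_string)  # [['老'], ['師'], ['你'], ['好']] — matches the output above
print(as_list)    # ['老師你好'] — one sentence, segmented as a whole
```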
Comparing local execution with the demo site, the same text takes 6 to 7 seconds locally but under 1 second on the demo site:
https://ckip.iis.sinica.edu.tw/service/ckiptagger/
Local machine: i7 6th gen, 16GB DDR3 1600MHz, GeForce 940M.
The 6 to 7 seconds above is with CPU tensorflow; after loading the model files, python3 uses about 4GB of RAM.
With tensorflow-gpu it takes 13 seconds, probably because the GeForce 940M has only 2GB of memory.
I'm building a small service dedicated to word segmentation and entity recognition for another system, and want to choose suitable hardware for it. My assumption was that a GPU would be faster than a CPU, but with the weak configuration above the GPU actually runs slower, not the expected result.
Roughly what configuration is needed to reach the demo site's speed?
Script: https://github.com/kiang/bribes_data/blob/master/03_ckip.py
Input file (JFULL field): https://github.com/kiang/bribes_data/blob/master/filter/200610/%E8%87%BA%E7%81%A3%E9%AB%98%E7%AD%89%E6%B3%95%E9%99%A2%E8%87%BA%E4%B8%AD%E5%88%86%E9%99%A2%E5%88%91%E4%BA%8B/TCHM%2C95%2C%E9%81%B8%E4%B8%8A%E8%A8%B4%2C1051%2C20061025%2C1.json
From what I found online, the batch size needs to be adjusted. What is generally recommended?
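One common approach (an assumption here, not official guidance for this library) is to split the document into smaller chunks and feed them to ws a batch at a time, so that each call stays within memory limits. A minimal chunking sketch, with the actual ws call left commented out since it needs the downloaded model:

```python
def batched(sentences, batch_size):
    """Yield successive batches of at most batch_size sentences."""
    for i in range(0, len(sentences), batch_size):
        yield sentences[i:i + batch_size]

# Hypothetical usage: run WS batch by batch instead of all at once.
# word_sentence_list = []
# for batch in batched(sentence_list, 64):
#     word_sentence_list.extend(ws(batch))
```

The batch size of 64 above is arbitrary; tuning it against available RAM/VRAM is the point of the exercise.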
Installed per the official documentation: pip install -U ckiptagger[tf,gdown]
This installs the latest tensorflow, but running the program raises an error because tf.nn.rnn_cell.LSTMCell is deprecated.
Hello:
Since stopword lists for Traditional Chinese segmentation are hard to find online, could you provide the stopword list used for segmentation?
Many thanks.
Hello,
how can I use model_ner to train on my own dataset?
Lines 5 to 17 in 9f9aff9
As you can see at lines 5 and 17, download is misspelled as downlaod.
Hello everyone, we would like to use this as the tokenizer for Elasticsearch. Does it need to be written as a plugin, as with Jieba? And is there an existing one already developed?
As in the title: whenever I run demo.py from GitHub, this error message appears. I would like to know how to resolve it.
Dependency: tensorflow 1.4.0
Using syntax 'from ckiptagger import data_utils'
Raises an AttributeError: module 'tensorflow.python.util.compat' has no attribute 'v1'
I'm using Python 3.9, but I get an error saying Python must be > 3.7. Has anyone encountered the same problem?
I tried to install with "ckiptagger[tf,gdown]", but it returns "no matches found: ckiptagger[tf,gdown]". When I install plain "ckiptagger" without the extras, it works fine. Please help.
I installed ckiptagger... (tensorflow 1.14) with
pip install -U ckiptagger
As soon as I add
from ckiptagger import data_utils, construct_dictionary, WS, POS, NER
tf.logging.info can no longer be used to display messages!!
Hello:
Can ckiptagger do character-based POS and NER?
Also, which character-based models currently perform well for Traditional Chinese?
Thanks.
After installing under tf 1.12, the import fails with:
module 'tensorflow._api.v1.compat' has no attribute 'v1'
Under tf 2.0 it imports, but loading the model fails with:
module 'tensorflow' has no attribute 'variable_scope'
How can I resolve this?
Do you test this package on Windows?
I've tried the following example as input:
這些語辭都含有高調音
這些(Neqa) 語辭(Na) 都(D) 含有(VJ) 高(VH) 調音(VA)
With customized dictionary, it was able to tag 高調音 as Na.
word_to_weight = {
    "高調音": 1,
    "土地公": 1,
    "土地婆": 1,
    "公有": 2,
    "": 1,
    "來亂的": "啦",
    "緯來體育台": 1,
}
word_sentence_list = ws(sentence_list, recommend_dictionary=dictionary)
Is there any code or paper describing how the data (token_list.npy, vector_list.npy, model_pos, etc.) were trained/created?
Thanks.
The example sets the weight dictionary as:
word_to_weight = {"土地公": 1, "土地婆": 1, "公有": 2, "": 1, "來亂的": "啦", "緯來體育台": 1}
Presumably the intent is to segment "土地公有政策??還是土地婆有政策。" as
['土地', '公有', '政策' ...]
but after defining the dictionary, the result is still
['土地公', '有', '政策' ...]
Where does the problem lie?
(PS: the example dictionary also looks odd; the "來亂的": "啦" entry seems wrong.)
As in the title, this error appears when I run it:
Permission denied: https://drive.google.com/uc?id=1efHsY16pxK0lBD2gYCgCTnv1Swstq771
Maybe you need to change permission over 'Anyone with the link'?
When building the pipeline per the README, the following error appears. How should it be resolved?
AssertionError Traceback (most recent call last)
<ipython-input-37-0e6a5bf68427> in <module>
----> 1 ner_list = ner( sent_list, pos_list )
D:\python3.7\lib\site-packages\ckiptagger\api.py in __call__(self, word_sentence_list, pos_sentence_list, character_normalization, batch_sentences, batch_characters)
306 word_sentence = [unicodedata.normalize("NFKD", raw_word) for raw_word in raw_word_sentence]
307 sentence, normal_to_raw_index = _normalize_sentence("".join(raw_word_sentence))
--> 308 assert sentence == "".join(word_sentence)
309 word_sentence_list.append(word_sentence)
310 normal2raw_list.append(normal_to_raw_index)
AssertionError:
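The assertion at api.py line 308 compares the NFKD-normalized sentence against the concatenation of normalized words, so input containing characters whose NFKD form differs from the raw text can trip it. One possible workaround (an assumption, not a documented fix; `prenormalize` is a hypothetical helper) is to NFKD-normalize sentences yourself before passing them to ws, so the later stages see text that is already in normal form:

```python
import unicodedata

def prenormalize(sentences):
    """NFKD-normalize each sentence so later WS/POS/NER stages see
    text that round-trips unchanged through normalization."""
    return [unicodedata.normalize("NFKD", s) for s in sentences]

# Full-width 'ＡＢＣ' and '㈠' change under NFKD; pre-normalizing makes
# the text stable, so a second normalization pass is a no-op.
sample = ["全形ＡＢＣ與㈠"]
normal = prenormalize(sample)
assert prenormalize(normal) == normal  # idempotent after the first pass
```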
Importing pandas and ckiptagger at the same time raises an error. How can this bug be resolved?
While running, CMD prints the following line:
F tensorflow/core/framework/tensor.cc:681] Check failed: IsAligned()
Is there a way to fix this?
Onomatopoeic words are not classified into a single category
>>> pos(["汪汪"])
[['Nb', 'Nb']]
>>> pos(["咩咩"])
[['I', 'I']]
>>> pos(["吱吱"])
[['D', 'D']]
>>> pos(["吱吱喳喳"])
[['D', 'D', 'D', 'D']]
>>> pos(["哞哞"])
[['I', 'D']]
Probably due to hardware constraints, I cannot use a newer version of tensorflow; the only workaround seems to be downgrading to version 1.5. Can this tool run on that version?
Most full-width Chinese punctuations are not included in the default sentence delimiter set.
Please consider adding ,:?!; to this set.
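Until those are defaults, a workaround is to pass an extended delimiter set explicitly; segment_delimiter_set is the parameter used with ws elsewhere in this thread. A pure-Python sketch of building the set and of what delimiter-based splitting does (the ws call itself needs the model, so only the splitting is illustrated):

```python
import re

# The marks proposed above, plus common full-width CJK punctuation.
# Which characters belong in the default set is the open question here.
delimiters = set(",:?!;") | set("。,;:?!")

# A rough illustration of delimiter-based sentence segmentation:
text = "土地公有政策?還是土地婆有政策。"
pattern = "[" + re.escape("".join(delimiters)) + "]"
pieces = [p for p in re.split(pattern, text) if p]
print(pieces)  # ['土地公有政策', '還是土地婆有政策']
```

A set like this could then be passed as segment_delimiter_set=delimiters when sentence_segmentation=True.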
During segmentation, a custom dictionary can influence the result.
For example, segmenting as follows:
text = '即起至4月14日在官邸進行居家隔離,暫停所有公開行程;其後再進行7天的自主健康管理。'
ws_list = ws([text], coerce_dictionary=construct_dictionary({'7天的自主健康管理': 2}))
pos_list = pos(ws_list)
yields the ideal segmentation result:
即(D) 起(Ng) 至(Caa) 4月(Nd) 14日(Nd) 在(P) 官邸(Nc) 進行(VC) 居家(Na) 隔離(VC) ,(COMMACATEGORY) 暫停(VHC) 所有(Neqa) 公開(VHC) 行程(Na) ;(SEMICOLONCATEGORY) 其後(Ncd) 再(D) 進行(VC) 7天的自主健康管理(Na) 。(PERIODCATEGORY)
But running entity recognition on this segmentation result:
entity_list = ner(ws_list, pos_list)
the NER result does not reflect the dictionary-adjusted segmentation, and instead gives:
[('7天', 'DATE', (32, 34)), ('4月14日', 'DATE', (3, 8))]
Why does the custom-dictionary segmentation not affect the NER result?
My understanding is that since segmentation successfully produced 「7天的自主健康管理」, entity recognition should operate on that token rather than on the 「7天」 that would come from segmentation without the dictionary. I hope you can find time to respond. Thank you!
What is the difference between recommend_dictionary and coerce_dictionary, and how does each affect segmentation?
Are there any examples that illustrate this?
Hello, following the official examples I wrote a get_nlp_result function that iterates over the rows of data_df and feeds the text in row[text_col] through the ws, pos, and ner functions in turn.
When running ner I sometimes hit an AssertionError. What causes this?
def get_nlp_result(data_df, id_col, text_col):
    start = time.time()
    pos_list = []
    entity_list = []
    sentence_list = []
    for index, row in data_df.iterrows():  # document level
        # print(f"\ndocument {index}")
        # clean data
        result = []
        tmp = Sentence_Segmentation(row[text_col])
        flat_list = [item for sublist in tmp for item in sublist]
        # ckip
        w_sentence_list = ws(flat_list, coerce_dictionary=dictionary2)  # set dictionary
        pos_sentence_list = pos(w_sentence_list)
        entity_sentence_list = ner(w_sentence_list, pos_sentence_list)
        for i, sentence in enumerate(flat_list):  # sentence level
            # print(f"sentence {i}: {sentence}")
            sentence_list.append([row[id_col], sentence])
            temp_tokens = get_pos(row[id_col], w_sentence_list[i], pos_sentence_list[i])
            temp_entites = get_ner(row[id_col], entity_sentence_list[i])
            pos_list.append(temp_tokens)
            if len(temp_entites) != 0:
                entity_list.append(temp_entites)
    pos_flat = [item for sublist in pos_list for item in sublist]
    entity_flat = [item for sublist in entity_list for item in sublist]
    pos_table = pd.DataFrame(data=pos_flat,
                             columns=[id_col, 'word', 'pos'])
    entity_table = pd.DataFrame(data=entity_flat,
                                columns=[id_col, 'word', 'ner'])
    sentence_table = pd.DataFrame(data=sentence_list,
                                  columns=[id_col, 'sentence'])
    end = time.time()
    print("time costing: {}".format(end - start))
    return pos_table, entity_table, sentence_table