233
This commit is contained in:
65
userdict.py
Normal file
65
userdict.py
Normal file
@@ -0,0 +1,65 @@
|
||||
import jieba
|
||||
import os
|
||||
from threading import Thread, Lock
|
||||
from queue import Queue
|
||||
|
||||
userdict = './userdict.txt'
|
||||
jieba.load_userdict(userdict)
|
||||
LOCK = Lock()
|
||||
|
||||
def readfile(filepath, encoding='utf-8'):
|
||||
# 读取文本
|
||||
with open(filepath, "rt", encoding=encoding) as fp:
|
||||
content = fp.read()
|
||||
return content
|
||||
|
||||
|
||||
def savefile(savepath, content):
|
||||
# 保存文本
|
||||
with open(savepath, "wt") as fp:
|
||||
fp.write(content)
|
||||
|
||||
def check_dir_exist(dir):
|
||||
# 坚持目录是否存在,不存在则创建
|
||||
if not os.path.exists(dir):
|
||||
os.mkdir(dir)
|
||||
|
||||
def text_segment(q):
|
||||
"""
|
||||
对一个类别目录下进行分词
|
||||
"""
|
||||
while not q.empty():
|
||||
from_dir, to_dir = q.get()
|
||||
with LOCK:
|
||||
print(from_dir)
|
||||
files = os.listdir(from_dir)
|
||||
for name in files:
|
||||
if name.startswith('.DS'):
|
||||
continue
|
||||
from_file = os.path.join(from_dir, name)
|
||||
to_file = os.path.join(to_dir, name)
|
||||
|
||||
content = readfile(from_file)
|
||||
seg_content = jieba.cut(content)
|
||||
savefile(to_file, ' '.join(seg_content))
|
||||
|
||||
|
||||
def corpus_seg(curpus_path, seg_path):
|
||||
"""对文本库分词,保存分词后的文本库,目录下以文件归类 curpus_path/category/1.txt, 保存为 seg_path/category/1.txt"""
|
||||
check_dir_exist(seg_path)
|
||||
q = Queue()
|
||||
cat_folders = os.listdir(curpus_path)
|
||||
for folder in cat_folders:
|
||||
from_dir = os.path.join(curpus_path, folder)
|
||||
to_dir = os.path.join(seg_path, folder)
|
||||
check_dir_exist(to_dir)
|
||||
|
||||
q.put((from_dir, to_dir))
|
||||
|
||||
for i in range(len(cat_folders)):
|
||||
Thread(target=text_segment, args=(q,)).start()
|
||||
if __name__ == '__main__':
|
||||
# 分词
|
||||
data_dir = './NCorpus/'
|
||||
corpus_seg(data_dir + 'train/', data_dir + 'train_seg')
|
||||
corpus_seg(data_dir + 'test/', data_dir + 'test_seg')
|
||||
Reference in New Issue
Block a user