65 lines
1.8 KiB
Python
65 lines
1.8 KiB
Python
|
|
import jieba
|
||
|
|
import os
|
||
|
|
from threading import Thread, Lock
|
||
|
|
from queue import Queue
|
||
|
|
|
||
|
|
userdict = './userdict.txt'
|
||
|
|
jieba.load_userdict(userdict)
|
||
|
|
LOCK = Lock()
|
||
|
|
|
||
|
|
def readfile(filepath, encoding='utf-8'):
|
||
|
|
# 读取文本
|
||
|
|
with open(filepath, "rt", encoding=encoding) as fp:
|
||
|
|
content = fp.read()
|
||
|
|
return content
|
||
|
|
|
||
|
|
|
||
|
|
def savefile(savepath, content):
|
||
|
|
# 保存文本
|
||
|
|
with open(savepath, "wt") as fp:
|
||
|
|
fp.write(content)
|
||
|
|
|
||
|
|
def check_dir_exist(dir):
|
||
|
|
# 坚持目录是否存在,不存在则创建
|
||
|
|
if not os.path.exists(dir):
|
||
|
|
os.mkdir(dir)
|
||
|
|
|
||
|
|
def text_segment(q):
|
||
|
|
"""
|
||
|
|
对一个类别目录下进行分词
|
||
|
|
"""
|
||
|
|
while not q.empty():
|
||
|
|
from_dir, to_dir = q.get()
|
||
|
|
with LOCK:
|
||
|
|
print(from_dir)
|
||
|
|
files = os.listdir(from_dir)
|
||
|
|
for name in files:
|
||
|
|
if name.startswith('.DS'):
|
||
|
|
continue
|
||
|
|
from_file = os.path.join(from_dir, name)
|
||
|
|
to_file = os.path.join(to_dir, name)
|
||
|
|
|
||
|
|
content = readfile(from_file)
|
||
|
|
seg_content = jieba.cut(content)
|
||
|
|
savefile(to_file, ' '.join(seg_content))
|
||
|
|
|
||
|
|
|
||
|
|
def corpus_seg(curpus_path, seg_path):
|
||
|
|
"""对文本库分词,保存分词后的文本库,目录下以文件归类 curpus_path/category/1.txt, 保存为 seg_path/category/1.txt"""
|
||
|
|
check_dir_exist(seg_path)
|
||
|
|
q = Queue()
|
||
|
|
cat_folders = os.listdir(curpus_path)
|
||
|
|
for folder in cat_folders:
|
||
|
|
from_dir = os.path.join(curpus_path, folder)
|
||
|
|
to_dir = os.path.join(seg_path, folder)
|
||
|
|
check_dir_exist(to_dir)
|
||
|
|
|
||
|
|
q.put((from_dir, to_dir))
|
||
|
|
|
||
|
|
for i in range(len(cat_folders)):
|
||
|
|
Thread(target=text_segment, args=(q,)).start()
|
||
|
|
if __name__ == '__main__':
|
||
|
|
# 分词
|
||
|
|
data_dir = './NCorpus/'
|
||
|
|
corpus_seg(data_dir + 'train/', data_dir + 'train_seg')
|
||
|
|
corpus_seg(data_dir + 'test/', data_dir + 'test_seg')
|