Files

65 lines
1.8 KiB
Python
Raw Permalink Normal View History

2023-07-11 00:02:46 -07:00
import jieba
import os
from threading import Thread, Lock
from queue import Queue
userdict = './userdict.txt'
jieba.load_userdict(userdict)
LOCK = Lock()
def readfile(filepath, encoding='utf-8'):
# 读取文本
with open(filepath, "rt", encoding=encoding) as fp:
content = fp.read()
return content
def savefile(savepath, content):
# 保存文本
with open(savepath, "wt") as fp:
fp.write(content)
def check_dir_exist(dir):
# 坚持目录是否存在,不存在则创建
if not os.path.exists(dir):
os.mkdir(dir)
def text_segment(q):
"""
对一个类别目录下进行分词
"""
while not q.empty():
from_dir, to_dir = q.get()
with LOCK:
print(from_dir)
files = os.listdir(from_dir)
for name in files:
if name.startswith('.DS'):
continue
from_file = os.path.join(from_dir, name)
to_file = os.path.join(to_dir, name)
content = readfile(from_file)
seg_content = jieba.cut(content)
savefile(to_file, ' '.join(seg_content))
def corpus_seg(curpus_path, seg_path):
"""对文本库分词,保存分词后的文本库,目录下以文件归类 curpus_path/category/1.txt, 保存为 seg_path/category/1.txt"""
check_dir_exist(seg_path)
q = Queue()
cat_folders = os.listdir(curpus_path)
for folder in cat_folders:
from_dir = os.path.join(curpus_path, folder)
to_dir = os.path.join(seg_path, folder)
check_dir_exist(to_dir)
q.put((from_dir, to_dir))
for i in range(len(cat_folders)):
Thread(target=text_segment, args=(q,)).start()
if __name__ == '__main__':
# 分词
data_dir = './NCorpus/'
corpus_seg(data_dir + 'train/', data_dir + 'train_seg')
corpus_seg(data_dir + 'test/', data_dir + 'test_seg')