-
Notifications
You must be signed in to change notification settings - Fork 2
Expand file tree
/
Copy pathutils.py
More file actions
69 lines (55 loc) · 1.6 KB
/
utils.py
File metadata and controls
69 lines (55 loc) · 1.6 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
import jieba
import random
import torch
import numpy as np
from pathlib import Path
from ckiptagger import data_utils
from termcolor import colored
# Both directories are located next to this module file, independent of the
# process's current working directory.
DATA_PATH = Path(__file__).parent.joinpath("data")
EXPORT_PATH = Path(__file__).parent.joinpath("export")
def set_seed(seed: int) -> None:
    """Seed the Python, NumPy and PyTorch random number generators.

    When CUDA is available, cuDNN benchmarking is also switched off and
    cuDNN deterministic mode is switched on.

    Args:
        seed: Value handed to every generator.
    """
    # Same seeding calls, in the same order, for each RNG source.
    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed_all,
    )
    for seeder in seeders:
        seeder(seed)

    # The cuDNN flags are only touched when a CUDA device is present.
    if not torch.cuda.is_available():
        return
    cudnn = torch.backends.cudnn
    cudnn.benchmark = False
    cudnn.deterministic = True
def set_up():
    """Prepare the runtime environment.

    Creates the data and export directories if they are missing, runs the
    CKIP data check, and seeds the random number generators with 42.
    """
    for folder in (DATA_PATH, EXPORT_PATH):
        # exist_ok makes repeated calls harmless.
        folder.mkdir(parents=True, exist_ok=True)
    ckip_check()
    set_seed(42)
def ckip_check():
    """Check that the CKIP model data is present; download it when it is not.

    For every required entry under ``DATA_PATH`` a status line is printed:
    the existence flag (blue when present, red when missing) followed by the
    entry name. If any entry is missing, the CKIP data archive is downloaded
    through ``data_utils.download_data_gdown``.

    Returns:
        None.
    """
    required = (
        "embedding_character",
        "embedding_word",
        "model_ner",
        "model_pos",
        "model_ws",
    )

    all_present = True
    for name in required:
        data_exists = (DATA_PATH / name).exists()
        print(colored(data_exists, "blue" if data_exists else "red"), name)
        if not data_exists:
            all_present = False

    if all_present:
        print("CKIP Data validation complete.")
        return

    print("Lack of CKIP data, Start download...")
    # Download next to this module rather than into the current working
    # directory: the check above looks in DATA_PATH, so fetching into "./"
    # would put the files somewhere else whenever the process is started from
    # another directory. Per the ckiptagger README the archive is extracted
    # into a "data/" folder under the given path, i.e. DATA_PATH itself.
    data_utils.download_data_gdown(str(DATA_PATH.parent))
    print("CKIP Data download complete.")
def clean_text(text):
    """Segment *text* with jieba and remove stop words.

    The stop-word list is read from ``DATA_PATH / "stopword.txt"`` (UTF-8,
    one entry per line) on every call.

    Args:
        text: The string to segment.

    Returns:
        The tokens that are not stop words, joined by single spaces.

    Raises:
        OSError: If the stop-word file cannot be opened (for example
            ``FileNotFoundError`` when it does not exist).
    """
    # read_text opens and closes the file itself, so no handle is leaked.
    stoptext = (DATA_PATH / "stopword.txt").read_text(encoding="utf-8")
    # A set gives constant-time membership checks for each token.
    stopwords = set(stoptext.split("\n"))
    return " ".join(w for w in jieba.lcut(text) if w not in stopwords)