Trying wordcutpy on pypy3
Testing on an 11 MB file, pypy3 made wordcutpy more than twice as fast! The elapsed time (the script prints milliseconds) dropped from about 16 seconds to under 8 seconds.
(base) [vee@mint310 wiki]$ python3 wordcutpy.py
16598
(base) [vee@mint310 wiki]$ sudo docker run -it --rm --name my-running-script -v "$PWD":/usr/src/myapp -w /usr/src/myapp pypy:3 pypy3 wordcutpy.py
7833
(base) [vee@mint310 wiki]$ python3 wordcutpy.py
16093
(base) [vee@mint310 wiki]$ sudo docker run -it --rm --name my-running-script -v "$PWD":/usr/src/myapp -w /usr/src/myapp pypy:3 pypy3 wordcutpy.py
7821
(base) [vee@mint310 wiki]$ python3 wordcutpy.py
16272
(base) [vee@mint310 wiki]$ sudo docker run -it --rm --name my-running-script -v "$PWD":/usr/src/myapp -w /usr/src/myapp pypy:3 pypy3 wordcutpy.py
7810
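Averaging the three runs: CPython 3 takes (16598 + 16093 + 16272) / 3 ≈ 16321 ms, while pypy3 takes (7833 + 7821 + 7810) / 3 ≈ 7821 ms, so the speedup is about 2.09×.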
# wordcutpy.py
# The correct way to use wordcutpy is to just copy & paste it; no pip needed 😅
# Then copy bigthai.txt into the same folder.
import re
class PrefixTree(object):
    def __init__(self, members_with_payload):
        # The trie is stored as one flat dict keyed by (row, offset, char).
        self.tab = {}
        if members_with_payload is None:
            return
        sorted_members_with_payload = sorted(members_with_payload,
                                             key=lambda i: i[0])
        for i in range(len(sorted_members_with_payload)):
            members, payload = sorted_members_with_payload[i]
            row_no = 0
            for j in range(len(members)):
                is_terminal = len(members) == j + 1
                member = members[j]
                key = (row_no, j, member)
                if key in self.tab:
                    # Shared prefix: follow the existing row.
                    row_no = self.tab[key][0]
                else:
                    val = (i, is_terminal, payload if is_terminal else None)
                    self.tab[key] = val
                    row_no = i

    def lookup(self, i, offset, member):
        key = (i, offset, member)
        if key not in self.tab:
            return None
        return self.tab[key]
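
# A quick check of the trie (my addition, not part of the original wordcutpy):
# lookup(row, offset, ch) returns (next_row, is_terminal, payload) or None.
_demo_trie = PrefixTree([(u"กา", None), (u"กาก", None)])
assert _demo_trie.lookup(0, 0, u"ก") == (0, False, None)  # prefix of both words
assert _demo_trie.lookup(0, 1, u"า") == (0, True, None)   # "กา" is itself a word
assert _demo_trie.lookup(0, 2, u"ก") == (1, True, None)   # ...and extends to "กาก"
assert _demo_trie.lookup(0, 0, u"x") is None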
# Link types
UNK = 1    # unknown substring
DICT = 2   # dictionary word
INIT = 3   # start-of-path sentinel
LATIN = 4  # run of Latin letters
PUNC = 5   # run of spaces
def is_better(link0, link1):
    # Prefer fewer unknown tokens first, then fewer tokens overall.
    if link0 is None:
        return True
    if link1["unk"] < link0["unk"]:
        return True
    if link1["w"] < link0["w"]:
        return True
    return False
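
# Sanity check (my addition): a 3-token segmentation with no unknowns beats a
# 2-token one that needs an unknown token.
assert is_better({"w": 2, "unk": 1}, {"w": 3, "unk": 0})
assert not is_better({"w": 2, "unk": 0}, {"w": 3, "unk": 0})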
def build_path(dix, s):
    left_boundary = 0
    dict_acc_list = []
    path = [{"p": None, "w": 0, "unk": 0, "type": INIT}]
    latin_s = None
    latin_e = None
    punc_s = None
    punc_e = None
    for i, ch in enumerate(s):
        dict_acc_list.append({"s": i, "p": 0, "final": False})

        # Update dict acceptors
        _dict_acc_list = dict_acc_list
        dict_acc_list = []
        for acc in _dict_acc_list:
            offset = i - acc["s"]
            child = dix.lookup(acc["p"], offset, ch)
            if child is not None:
                child_p, is_final, payload = child
                dict_acc_list.append({"s": acc["s"], "p": child_p,
                                      "final": is_final})

        # Latin words
        if latin_s is None:
            if re.match(u"[A-Za-z]", ch):
                latin_s = i
        if latin_s is not None:
            if re.match(u"[A-Za-z]", ch):
                # The run ends at the last Latin letter.
                if i + 1 == len(s) or not re.match(u"[A-Za-z]", s[i + 1]):
                    latin_e = i
            else:
                latin_s = None
                latin_e = None

        # Punctuation (runs of spaces)
        if punc_s is None:
            if ch == " ":
                punc_s = i
        if punc_s is not None:
            if ch == " ":
                if len(s) == i + 1 or s[i + 1] != " ":
                    punc_e = i
            else:
                punc_s = None
                punc_e = None

        # Select link
        link = None

        # Links from the wordlist
        for acc in dict_acc_list:
            if acc["final"]:
                p_link = path[acc["s"]]
                _link = {"p": acc["s"],
                         "w": p_link["w"] + 1,
                         "unk": p_link["unk"],
                         "type": DICT}
                if is_better(link, _link):
                    link = _link

        # Link from a Latin word
        if latin_s is not None and latin_e is not None:
            p_link = path[latin_s]
            _link = {"p": latin_s,
                     "w": p_link["w"] + 1,
                     "unk": p_link["unk"],
                     "type": LATIN}
            if is_better(link, _link):
                link = _link

        # Link from punctuation
        if punc_s is not None and punc_e is not None:
            p_link = path[punc_s]
            _link = {"p": punc_s,
                     "w": p_link["w"] + 1,
                     "unk": p_link["unk"],
                     "type": PUNC}
            if is_better(link, _link):
                link = _link

        # Fallback: an unknown token starting at the last known-good boundary
        if link is None:
            p_link = path[left_boundary]
            link = {"p": left_boundary,
                    "w": p_link["w"] + 1,
                    "unk": p_link["unk"] + 1,
                    "type": UNK}

        path.append(link)
        if link["type"] != UNK:
            # The boundary after character i is path index i + 1.
            left_boundary = i + 1
    return path
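
# Peek at the path (my addition): with a toy dictionary, the best link at the
# end of u"กามา" is a DICT link whose parent boundary is 2, i.e. "กา" | "มา".
_demo_dix = PrefixTree([(u"กา", None), (u"มา", None)])
_demo_path = build_path(_demo_dix, u"กามา")
assert _demo_path[-1] == {"p": 2, "w": 2, "unk": 0, "type": DICT}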
def path_to_tokens(txt, path):
    # Walk the parent pointers back from the end of the path.
    if len(path) < 2:
        return None
    e = len(path) - 1
    toks = []
    while True:
        link = path[e]
        s = link["p"]
        if s is None:
            break
        toks.append(txt[s:e])
        e = s
    toks.reverse()
    return toks
def tokenize(dix, txt):
    if txt is None or txt == "":
        return []
    path = build_path(dix, txt)
    return path_to_tokens(txt, path)
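
# End-to-end check (my addition, reusing _demo_dix from above): dictionary
# words, a Latin run, and spaces each come out as single tokens.
assert tokenize(_demo_dix, u"กามา") == [u"กา", u"มา"]
assert tokenize(_demo_dix, u"กาcat") == [u"กา", u"cat"]
assert tokenize(_demo_dix, u"กา มา") == [u"กา", u" ", u"มา"]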
class Wordcut(object):
    def __init__(self, wordlist):
        self.dix = PrefixTree([(word, None) for word in wordlist])

    @classmethod
    def bigthai(cls):
        "Initialize from bigthai"
        import os
        fileDir = os.path.dirname(__file__)
        filename = os.path.join(fileDir, 'bigthai.txt')
        with open(filename) as dict_file:
            word_list = list(set([w.rstrip() for w in dict_file.readlines()]))
            word_list.sort()
            return cls(word_list)

    def tokenize(self, s):
        return tokenize(self.dix, s)
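
# You can also build a Wordcut from your own wordlist instead of bigthai.txt
# (my addition):
_demo_wc = Wordcut([u"กา", u"มา"])
assert _demo_wc.tokenize(u"กา มา") == [u"กา", u" ", u"มา"]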
wordcut = Wordcut.bigthai()

import time
t1 = int(round(time.time() * 1000))
with open("wiki_plain_100k.txt") as fi:
    with open("wiki.cut", "w") as fo:
        for line in fi:
            line = line.strip()
            print(" ".join(wordcut.tokenize(line)), file=fo)
t2 = int(round(time.time() * 1000))
print(t2 - t1)  # elapsed milliseconds
# LICENSE: LGPLv3