English Text Preprocessing
Word base forms
word_map = {
    "i'll": "i will",
    "it'll": "it will",
    "we'll": "we will",
    "he'll": "he will",
    "they'll": "they will",
    "i'd": "i would",
    "we'd": "we would",
    "he'd": "he would",
    "they'd": "they would",
    "i'm": "i am",
    "he's": "he is",
    "she's": "she is",
    "that's": "that is",
    "here's": "here is",
    "there's": "there is",
    "we're": "we are",
    "they're": "they are",
    "who's": "who is",
    "what's": "what is",
    "i've": "i have",
    "we've": "we have",
    "they've": "they have",
    "wanna": "want to",
    "can't": "can not",
    "ain't": "are not",
    "isn't": "is not",
    "and/or": "and or",
}
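Applied token by token after lower-casing, the map expands the contractions it knows and leaves everything else alone. A minimal sketch (the helper name expand_contractions is illustrative, not part of the code below):

def expand_contractions(sentence: str) -> str:
    # look each lower-cased token up in word_map; unknown tokens pass through
    return " ".join(word_map.get(w, w) for w in sentence.lower().split())

print(expand_contractions("What's done is done"))  # -> "what is done is done"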
Example
from sklearn.datasets import fetch_20newsgroups
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords

wordnet_lemmatizer = WordNetLemmatizer()
stop = set(stopwords.words('english'))
newsgroups = fetch_20newsgroups()

# Contraction-expansion map
word_map = {
    "i'll": "i will",
    "it'll": "it will",
    "we'll": "we will",
    "he'll": "he will",
    "they'll": "they will",
    "i'd": "i would",
    "we'd": "we would",
    "he'd": "he would",
    "they'd": "they would",
    "i'm": "i am",
    "he's": "he is",
    "she's": "she is",
    "that's": "that is",
    "here's": "here is",
    "there's": "there is",
    "we're": "we are",
    "they're": "they are",
    "who's": "who is",
    "what's": "what is",
    "i've": "i have",
    "we've": "we have",
    "they've": "they have",
    "wanna": "want to",
    "can't": "can not",
    "ain't": "are not",
    "isn't": "is not",
}

# A character is valid if it is a letter or the apostrophe [']
def isValidChar(ch: str) -> bool:
    return ch.isalpha() or ch == "'"

# A word is valid if it consists only of letters and apostrophes (and is not absurdly long)
def isValidWord(word: str) -> bool:
    if len(word) > 20: return False
    for ch in word:
        if not isValidChar(ch): return False
    return True

# Normalize a single word: expand contractions, strip the possessive 's
def handleWord(word: str) -> str:
    if word in word_map: return word_map[word]
    if len(word) > 1 and word[-2:] == "'s": return word[:-2]
    return word

# Clean one document
def handleText(text: str) -> list:
    res = []
    for word in text.lower().split():
        if not isValidChar(word[-1]): word = word[:-1]  # drop one trailing punctuation mark
        if not isValidWord(word): continue
        res.append(handleWord(word))
    res = " ".join(res).split()  # expanded contractions contain spaces, so re-split
    # drop stopwords and reduce each remaining word to its base form
    res = [wordnet_lemmatizer.lemmatize(word) for word in res if word not in stop]
    return res

# Clean a list of documents
def handleTextList(texts: list) -> list:
    res = []
    for i, text in enumerate(texts):
        if (i + 1) % 1000 == 0:
            print("\r\tFiltering: {:.2f}% of {} texts".format((i + 1) * 100 / len(texts), len(texts)), end="", flush=True)
        res.append(handleText(text))
    print()
    return res

if __name__ == '__main__':
    texts = handleTextList(newsgroups['data'])
    # print the first cleaned document to inspect the result
    # print(texts[0])
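A quick sanity check of handleText (the expected output assumes NLTK's standard English stopword list, which removes words such as "he", "is", and "not"):

print(handleText("He's happy, isn't he?"))
# -> ['happy']   (contractions expanded, punctuation stripped, stopwords removed)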
20200725: dependency parsing with CoreNLP
import os
import re
import time

import joblib
from nltk.parse import CoreNLPDependencyParser


def get_dataset(name):
    with open(f"dataset/{name}/texts.txt", "r", encoding="utf-8") as f:
        texts = f.read().strip().split("\n")
    with open(f"dataset/{name}/labels.txt", "r", encoding="utf-8") as f:
        labels = f.read().strip().split("\n")
    return texts, labels


def clean_text(text: str) -> str:
    text = text.lower()[:8000]  # keep at most 8000 characters
    text = re.sub(r"[^a-z0-9(),.!@?'`]", " ", text)
    # expand irregular contractions first, before the generic "n't" rule
    # rewrites them (otherwise "can't " would already have become "ca not ")
    text = text.replace("can't", "can not")
    text = text.replace("ain't", "are not")
    text = text.replace("isn't", "is not")
    # generic contraction suffixes
    text = text.replace("'m ", " am ")
    text = text.replace("'s ", " is ")
    text = text.replace("'ve ", " have ")
    text = text.replace("n't ", " not ")
    text = text.replace("'re ", " are ")
    text = text.replace("'d ", " would ")
    text = text.replace("'ll ", " will ")
    # pad punctuation with spaces so each mark becomes its own token
    text = text.replace(",", " , ")
    text = text.replace("!", " ! ")
    text = text.replace("(", " ( ")
    text = text.replace(")", " ) ")
    text = text.replace("?", " ? ")
    text = text.replace("'", " ' ")
    text = text.replace("`", " ` ")
    text = " ".join(text.split())
    # collapse runs of dots
    while text.find("..") != -1:
        text = text.replace("..", ".")
    # collapse repeated punctuation tokens (" ! ! " -> " ! ", etc.)
    while True:
        count = 0
        for c in ",!()?'`.":
            while text.find(f" {c} {c} ") != -1:
                count += 1
                text = text.replace(f" {c} {c} ", f" {c} ")
        if count == 0:
            break
    # trim to the span between the first and last alphanumeric character
    starts = [s for s in (text.find(c) for c in "abcdefghijklmnopqrstuvwxyz0123456789") if s != -1]
    ends = [e for e in (text.rfind(c) for c in "abcdefghijklmnopqrstuvwxyz0123456789") if e != -1]
    if not starts:  # nothing alphanumeric survived the cleaning
        return ""
    return text[min(starts):max(ends) + 1]
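# An illustrative, hand-worked call (assuming the replacement order above):
#   clean_text("I can't go... (really!!)")  ->  "i can not go. ( really"
# The trailing "! )" disappears in the final step, which trims the text to the
# span between its first and last alphanumeric character.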
def parse_text(text: str, nlp):
    # dependency-parse the text; each element of data is one sentence's parse
    data = list(nlp.parse_text(text))
    words, graph = [], [[], []]
    for seq in data:
        nodes = seq.nodes
        num = len(words)  # offset of this sentence's words in the flat word list
        for i in range(1, len(nodes)):  # node 0 is the artificial ROOT
            node = nodes[i]
            words.append(node['lemma'])
            # one directed edge per dependency arc, plus its reverse
            edges = [(i - 1 + num, n - 1 + num) for deps in node['deps'].values() for n in deps]
            edges += [(e, s) for s, e in edges]
            edges = sorted(set(edges))
            graph[0] += [s for s, e in edges]
            graph[1] += [e for s, e in edges]
    text = " ".join(words)
    return text, graph
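# The returned graph is a bidirectional edge list in COO form: graph[0] holds
# source word indices and graph[1] the matching targets, so an arc between
# words 0 and 2 appears as both (0, 2) and (2, 0). This is the layout that
# graph libraries such as torch_geometric expect for edge_index (an assumption
# about downstream use; the script itself only saves the lists).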
def parse_texts(texts, nlp):
    t1 = time.time()
    texts_split, graphs = [], []
    print("start")
    for i, t in enumerate(texts):
        if (i + 1) % 100 == 0:
            print(f"{i + 1}/{len(texts)} {(i + 1) * 100 / len(texts):.2f}% cost={time.time() - t1:.2f}")
            t1 = time.time()
        text, graph = parse_text(t, nlp)
        texts_split.append(text)
        graphs.append(graph)
    return texts_split, graphs
names = ["20ng", "mr", "ohsumed", "R8", "R52"]
name = names[0]

texts, labels = get_dataset(name)
root = f"source/{name}"
os.makedirs(root, exist_ok=True)

texts_clean = [clean_text(t) for t in texts]
with open(f"{root}/texts_clean.txt", "w", encoding="utf-8") as f:
    f.write("\n".join(texts_clean))

nlp = CoreNLPDependencyParser()
texts_split, graphs = parse_texts(texts_clean, nlp)
with open(f"{root}/texts_split.txt", "w", encoding="utf-8") as f:
    f.write("\n".join(texts_split))
joblib.dump(graphs, f"{root}/graphs.pkl")
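Note that CoreNLPDependencyParser is only a client: it needs a Stanford CoreNLP server already running (by default at http://localhost:9000) and does not start one itself. A minimal connectivity check, with the server URL written out as an explicit assumption:

# start the server first from the CoreNLP distribution directory, e.g.:
#   java -mx4g -cp "*" edu.stanford.nlp.pipeline.StanfordCoreNLPServer -port 9000 -timeout 15000
from nltk.parse import CoreNLPDependencyParser

parser = CoreNLPDependencyParser(url="http://localhost:9000")
graph = next(parser.parse_text("the quick brown fox jumps"))
print(graph.to_conll(4))  # one token per line: word, POS tag, head index, relation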