English Text Preprocessing
Word base forms
word_map = {
    "i'll": "i will",
    "it'll": "it will",
    "we'll": "we will",
    "he'll": "he will",
    "they'll": "they will",
    "i'd": "i would",
    "we'd": "we would",
    "he'd": "he would",
    "they'd": "they would",
    "i'm": "i am",
    "he's": "he is",
    "she's": "she is",
    "that's": "that is",
    "here's": "here is",
    "there's": "there is",
    "we're": "we are",
    "they're": "they are",
    "who's": "who is",
    "what's": "what is",
    "i've": "i have",
    "we've": "we have",
    "they've": "they have",
    "wanna": "want to",
    "can't": "can not",
    "ain't": "are not",
    "isn't": "is not",
    "and/or": "and or",
}
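Applied token by token after lower-casing, the map expands the contractions it knows and leaves everything else alone. A minimal sketch (the helper name expand_contractions is illustrative, not part of the code below):

def expand_contractions(sentence: str) -> str:
    # look each lower-cased token up in word_map; unknown tokens pass through
    return " ".join(word_map.get(w, w) for w in sentence.lower().split())

print(expand_contractions("What's done is done"))  # -> "what is done is done"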
Example
from sklearn.datasets import fetch_20newsgroups
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords

wordnet_lemmatizer = WordNetLemmatizer()
stop = set(stopwords.words('english'))
newsgroups = fetch_20newsgroups()

# Contraction-expansion map
word_map = {
    "i'll": "i will",
    "it'll": "it will",
    "we'll": "we will",
    "he'll": "he will",
    "they'll": "they will",
    "i'd": "i would",
    "we'd": "we would",
    "he'd": "he would",
    "they'd": "they would",
    "i'm": "i am",
    "he's": "he is",
    "she's": "she is",
    "that's": "that is",
    "here's": "here is",
    "there's": "there is",
    "we're": "we are",
    "they're": "they are",
    "who's": "who is",
    "what's": "what is",
    "i've": "i have",
    "we've": "we have",
    "they've": "they have",
    "wanna": "want to",
    "can't": "can not",
    "ain't": "are not",
    "isn't": "is not",
}

# A character is valid if it is a letter or the apostrophe [']
def isValidChar(ch: str) -> bool:
    return ch.isalpha() or ch == "'"

# A word is valid if it consists only of letters and apostrophes (and is not absurdly long)
def isValidWord(word: str) -> bool:
    if len(word) > 20: return False
    for ch in word:
        if not isValidChar(ch): return False
    return True

# Normalize a single word: expand contractions, strip the possessive 's
def handleWord(word: str) -> str:
    if word in word_map: return word_map[word]
    if len(word) > 1 and word[-2:] == "'s": return word[:-2]
    return word

# Clean one document
def handleText(text: str) -> list:
    res = []
    for word in text.lower().split():
        if not isValidChar(word[-1]): word = word[:-1]  # drop one trailing punctuation mark
        if not isValidWord(word): continue
        res.append(handleWord(word))
    res = " ".join(res).split()  # expanded contractions contain spaces, so re-split
    # drop stopwords and reduce each remaining word to its base form
    res = [wordnet_lemmatizer.lemmatize(word) for word in res if word not in stop]
    return res

# Clean a list of documents
def handleTextList(texts: list) -> list:
    res = []
    for i, text in enumerate(texts):
        if (i + 1) % 1000 == 0:
            print("\r\tFiltering: {:.2f}% of {} texts".format((i + 1) * 100 / len(texts), len(texts)), end="", flush=True)
        res.append(handleText(text))
    print()
    return res

if __name__ == '__main__':
    texts = handleTextList(newsgroups['data'])
    # print the first cleaned document to inspect the result
    # print(texts[0])
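A quick sanity check of handleText (the expected output assumes NLTK's standard English stopword list, which removes words such as "he", "is", and "not"):

print(handleText("He's happy, isn't he?"))
# -> ['happy']   (contractions expanded, punctuation stripped, stopwords removed)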
20200725: dependency parsing with CoreNLP
import os
import re
import time

import joblib
from nltk.parse import CoreNLPDependencyParser


def get_dataset(name):
    with open(f"dataset/{name}/texts.txt", "r", encoding="utf-8") as f:
        texts = f.read().strip().split("\n")
    with open(f"dataset/{name}/labels.txt", "r", encoding="utf-8") as f:
        labels = f.read().strip().split("\n")
    return texts, labels


def clean_text(text: str) -> str:
    text = text.lower()[:8000]  # keep at most 8000 characters
    text = re.sub(r"[^a-z0-9(),.!@?'`]", " ", text)
    # expand irregular contractions first, before the generic "n't" rule
    # rewrites them (otherwise "can't " would already have become "ca not ")
    text = text.replace("can't", "can not")
    text = text.replace("ain't", "are not")
    text = text.replace("isn't", "is not")
    # generic contraction suffixes
    text = text.replace("'m ", " am ")
    text = text.replace("'s ", " is ")
    text = text.replace("'ve ", " have ")
    text = text.replace("n't ", " not ")
    text = text.replace("'re ", " are ")
    text = text.replace("'d ", " would ")
    text = text.replace("'ll ", " will ")
    # pad punctuation with spaces so each mark becomes its own token
    text = text.replace(",", " , ")
    text = text.replace("!", " ! ")
    text = text.replace("(", " ( ")
    text = text.replace(")", " ) ")
    text = text.replace("?", " ? ")
    text = text.replace("'", " ' ")
    text = text.replace("`", " ` ")
    text = " ".join(text.split())
    # collapse runs of dots
    while text.find("..") != -1:
        text = text.replace("..", ".")
    # collapse repeated punctuation tokens (" ! ! " -> " ! ", etc.)
    while True:
        count = 0
        for c in ",!()?'`.":
            while text.find(f" {c} {c} ") != -1:
                count += 1
                text = text.replace(f" {c} {c} ", f" {c} ")
        if count == 0:
            break
    # trim to the span between the first and last alphanumeric character
    starts = [s for s in (text.find(c) for c in "abcdefghijklmnopqrstuvwxyz0123456789") if s != -1]
    ends = [e for e in (text.rfind(c) for c in "abcdefghijklmnopqrstuvwxyz0123456789") if e != -1]
    if not starts:  # nothing alphanumeric survived the cleaning
        return ""
    return text[min(starts):max(ends) + 1]
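# An illustrative, hand-worked call (assuming the replacement order above):
#   clean_text("I can't go... (really!!)")  ->  "i can not go. ( really"
# The trailing "! )" disappears in the final step, which trims the text to the
# span between its first and last alphanumeric character.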
def parse_text(text: str, nlp):
    # dependency-parse the text; each element of data is one sentence's parse
    data = list(nlp.parse_text(text))
    words, graph = [], [[], []]
    for seq in data:
        nodes = seq.nodes
        num = len(words)  # offset of this sentence's words in the flat word list
        for i in range(1, len(nodes)):  # node 0 is the artificial ROOT
            node = nodes[i]
            words.append(node['lemma'])
            # one directed edge per dependency arc, plus its reverse
            edges = [(i - 1 + num, n - 1 + num) for deps in node['deps'].values() for n in deps]
            edges += [(e, s) for s, e in edges]
            edges = sorted(set(edges))
            graph[0] += [s for s, e in edges]
            graph[1] += [e for s, e in edges]
    text = " ".join(words)
    return text, graph
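# The returned graph is a bidirectional edge list in COO form: graph[0] holds
# source word indices and graph[1] the matching targets, so an arc between
# words 0 and 2 appears as both (0, 2) and (2, 0). This is the layout that
# graph libraries such as torch_geometric expect for edge_index (an assumption
# about downstream use; the script itself only saves the lists).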
def parse_texts(texts, nlp):
    t1 = time.time()
    texts_split, graphs = [], []
    print("start")
    for i, t in enumerate(texts):
        if (i + 1) % 100 == 0:
            print(f"{i + 1}/{len(texts)} {(i + 1) * 100 / len(texts):.2f}% cost={time.time() - t1:.2f}")
            t1 = time.time()
        text, graph = parse_text(t, nlp)
        texts_split.append(text)
        graphs.append(graph)
    return texts_split, graphs
names = ["20ng", "mr", "ohsumed", "R8", "R52"]
name = names[0]

texts, labels = get_dataset(name)
root = f"source/{name}"
os.makedirs(root, exist_ok=True)

texts_clean = [clean_text(t) for t in texts]
with open(f"{root}/texts_clean.txt", "w", encoding="utf-8") as f:
    f.write("\n".join(texts_clean))

nlp = CoreNLPDependencyParser()
texts_split, graphs = parse_texts(texts_clean, nlp)
with open(f"{root}/texts_split.txt", "w", encoding="utf-8") as f:
    f.write("\n".join(texts_split))
joblib.dump(graphs, f"{root}/graphs.pkl")
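Note that CoreNLPDependencyParser is only a client: it needs a Stanford CoreNLP server already running (by default at http://localhost:9000) and does not start one itself. A minimal connectivity check, with the server URL written out as an explicit assumption:

# start the server first from the CoreNLP distribution directory, e.g.:
#   java -mx4g -cp "*" edu.stanford.nlp.pipeline.StanfordCoreNLPServer -port 9000 -timeout 15000
from nltk.parse import CoreNLPDependencyParser

parser = CoreNLPDependencyParser(url="http://localhost:9000")
graph = next(parser.parse_text("the quick brown fox jumps"))
print(graph.to_conll(4))  # one token per line: word, POS tag, head index, relation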