nltk 同义词替换 单词拼写校正 制作伪原创文章

一.基于贝叶斯单词拼写校正
# -*- coding: utf-8 -*-
# @Time    : 2019/11/26 10:13
# @Author  :
# @FileName: word_check.py

import os
import re
import collections

# 下载词库big.txt文件到本地
# import requests
# url = "http://norvig.com/big.txt"
# response = requests.get(url=url)
# with open("big.txt","w",encoding="utf-8") as f:
#     f.write(response.text)

def words(text):
    return re.findall('[a-z]+', text.lower())


def train(features):
    model = collections.defaultdict(lambda: 1)
    for f in features:
        model[f] += 1
    return model


filepath = os.path.dirname(__file__)
NWORDS = train(words(open('%s/big.txt' % filepath).read()))

alphabet = 'abcdefghijklmnopqrstuvwxyz'


def edits1(word):
    splits = [(word[:i], word[i:]) for i in range(len(word) + 1)]
    deletes = [a + b[1:] for a, b in splits if b]
    transposes = [a + b[1] + b[0] + b[2:] for a, b in splits if len(b) > 1]
    replaces = [a + c + b[1:] for a, b in splits for c in alphabet if b]
    inserts = [a + c + b for a, b in splits for c in alphabet]
    return set(deletes + transposes + replaces + inserts)


def known_edits2(word):
    return set(e2 for e1 in edits1(word) for e2 in edits1(e1) if e2 in NWORDS)


def known(words): return set(w for w in words if w in NWORDS)


def correct(word):
    candidates = known([word]) or known(edits1(word)) or known_edits2(word) or [word]
    return max(candidates, key=NWORDS.get)


if __name__ == '__main__':
    print(correct("Hammett"))

二.同义词替换生成伪原创文章
# -*- coding: utf-8 -*-
# @Time    : 2019/11/26 10:13
# @Author  :

import re

import nltk
import inflect
from nltk.corpus import wordnet as wn
from nltk.stem import WordNetLemmatizer

from word_check import correct

# 缩略词还原
replacement_patterns = [
    ("won\'t", "will not"),
    ("won\’t", "will not"),
    ("can\'t", "cannot"),
    ("can\’t", "cannot"),
    ("i\'m", "i am"),
    ("i\’m", "i am"),
    ("ain\'t", "is not"),
    ("ain\’t", "is not"),
    ("(\w+)\'ll", "\g<1> will"),
    ("(\w+)\’ll", "\g<1> will"),
    ("(\w+)n\'t", "\g<1> not"),
    ("(\w+)n’t", "\g<1> not"),
    ("(\w+)\'ve", "\g<1> have"),
    ("(\w+)\’ve", "\g<1> have"),
    ("(\w+)\'s", "\g<1> is"),
    ("(\w+)\’s", "\g<1> is"),
    ("(\w+)\'re", "\g<1> are"),
    ("(\w+)\’re", "\g<1> are"),
    ("(\w+)\'d", "\g<1> would"),
    ("(\w+)\’d", "\g<1> would")
]


class RegexpReplacer(object):
    def __init__(self, patterns=replacement_patterns):
        """缩略词还原"""
        self.patterns = [(re.compile(regex), repl) for (regex, repl) in patterns]

    def rep(self, text):
        s = text
        for (pattern, repl) in self.patterns:
            s = re.sub(pattern, repl, s)
        return s


def generate_the_article(text):
    """
    伪原创文章生成
    :param text:
    :return:
    """
    replacer = RegexpReplacer()
    text = replacer.rep(text)

    # 将文章切割成句子
    setences = nltk.sent_tokenize(text)
    all_world = []
    for setn in setences:

        # 词性分析
        tokens = nltk.word_tokenize(setn)
        pos_tags = nltk.pos_tag(tokens)
        new_world = []

        for word, pos in pos_tags:
            pos_list = ["VBG", "JJ", "NN", "NNS", "MD", "VB", "VBD"]
            if pos in pos_list:
                word_list = []

                # 同义词替换
                if pos in ['NN', 'NNS', 'NNP', 'NNPS']:
                    word_set = wn.synsets(word, pos='n')
                elif pos in ['VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ', "MD"]:
                    word_set = wn.synsets(word, pos='v')
                elif pos in ['RB', 'RBR', 'RBS']:
                    word_set = wn.synsets(word, pos='r')
                elif pos in ['JJ', 'JJR', 'JJS']:
                    word_set = wn.synsets(word, pos='a')
                else:
                    word_set = wn.synsets(word)
                w_list = list(i._lemma_names for i in word_set)
                for w in w_list:
                    word_list.extend(w)
                if word_list:
                    word_set = set(word_list)
                if word_set:
                    word_set.discard(word)
                    if word_set:
                        word_set.discard(word)
                        if word.istitle():
                            nw = word_set.pop().replace("_", " ").capitalize()
                        else:
                            p_word = word_set.pop().replace("_", " ")
                            if p_word.lower() == word.lower():
                                nw = word
                            else:
                                nw = p_word
                        if pos in ["VBG"]:

                            # 动词现在进行时转换
                            nw = p.present_participle(nw)
                            # 单词拼写校正
                            nw = correct(nw)

                        elif pos in ["NNS", "NNPS"]:
                            if not nw.endswith("s"):
                                # 名词复数转换
                                nw = p.plural_noun(nw)
                                # 单词拼写校正
                                nw = correct(nw)
                    else:
                        nw = word
                else:
                    nw = word
                new_world.append(nw)
            else:
                new_world.append(word)
            if new_world:
                if new_world[-1] in [',', ':', ',', '.', ';', '。', ';', '-', '—', '?', '?', '!', '!', ']', '】', '}',
                                     '}', ')',
                                     ')', '|']:
                    if len(new_world) > 1:
                        point = new_world.pop(-1)
                        new_world[-1] = new_world[-1] + point

        all_world.extend(new_world)

    return " ".join(all_world).replace("( ", "(").replace("( ", "(").replace('[ ', "[").replace('【 ', '【', ).replace(
        '{ ', '{', ).replace('{ ', '{')


def main():
    global p, lemmatizer
    text = """NORMANDY PARK, Wash. — There was just one problem with the brand-new, wide-open layout of Kay and Bob Comiskey's home: it was really open.
"We remodelled and didn't have money for furniture," says Kay. "We lived for three years with almost nothing. We had one sectional we'd move from room to room. The UPS driver asked if we ballroom-danced."
They did not. But the Comiskeys are promenading on air these days, now that Bjarko5/8Serra Architects (who created those wide-open spaces) connected them with Amy May, of MAY Designs (who creatively filled those wide-open spaces).
The Comiskeys love their family-friendly neighbourhood and their delightful midcentury home (originally an old, dark A-frame), which served admirably for years as a flexible, active hub for their three kids and their coming-and-going lifestyle. But once the nest emptied, May says, "They wanted a new way to enjoy the house, such as creating intimate gathering spaces with dedicated furniture."
May didn't have to look far for decor inspiration: sand and saltwater shimmer forever, just outside a west-facing wall of windows.
"The clients wanted to maintain a neutral palette that acted as a backdrop and setting for the natural beauty of the Puget Sound," May says.
And now, a beautiful blend of natural simplicity and industrial touches artfully flows through the reimagined first floor, in driftwood, coral and beachy glass; colour-popping art pieces; and all-new, fantastically functional furniture whose only movement is the occasional swivel of a purposely placed chair.
•In the warmly welcoming living room, May softened the existing two-storey, black-clad fireplace with a giant, artsy, battery-operated clock that hangs from the 20-foot ceiling. 
"It casts interesting shadows and helps break up the mass of the black background," she says. 
"""
    p = inflect.engine()
    lemmatizer = WordNetLemmatizer()
    text_nw = generate_the_article(text=text)
    print(text_nw)

    # 借助第三方平台进行语法校正
    # print(grammar_check(text_nw))


if __name__ == '__main__':
    main()

# 词性分析: https://www.jianshu.com/p/418cbdbf5e20
# 同义词替换: https://blog.csdn.net/jining11/article/details/89458865
# 词性还原: https://pypi.org/project/inflect/

# 单词拼写校验 https://blog.csdn.net/Pwiling/article/details/50573650
# textblob https://textblob.readthedocs.io/en/dev/quickstart.html
  • 0
    点赞
  • 3
    收藏
    觉得还不错? 一键收藏
  • 打赏
    打赏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包

打赏作者

Cocktail_py

你的鼓励将是我创作的最大动力

¥1 ¥2 ¥4 ¥6 ¥10 ¥20
扫码支付:¥1
获取中
扫码支付

您的余额不足,请更换扫码支付或充值

打赏作者

实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值