利用python爬虫结合前端技能实现经济学人(The Economist)阅时即查APP(011)

使用 Python 实现数据清洗,过滤出每篇文章中有一定难度的单词

#coding=utf-8
from collections import Counter#用于词频统计
import re
import os
import json
import time
import sys
from sys import argv

# Collection of "simple" words; the rest of the script only ever performs
# ``x in exclude`` membership tests against it.
# - JSON text is UTF-8 by specification, so decode it explicitly instead of
#   relying on the platform default encoding.
# - A frozenset makes each membership test O(1) instead of a linear list scan.
#   Assumes the JSON holds hashable entries (e.g. a flat list of strings) --
#   TODO confirm against simpleWords.json.
with open('simpleWords.json', 'r', encoding='utf-8') as fo:
    exclude = frozenset(json.load(fo))
# NOTE(review): one-second pause kept from the original; nothing visible in
# this file depends on it -- confirm before removing.
time.sleep(1)

def getOrigianlForm(word):
    """Report whether ``word`` should be kept as a "difficult" word.

    Despite the name, no word form is returned.  The function strips common
    English suffixes one after another and looks each candidate base form up
    in the module-level ``exclude`` collection of simple words.

    Args:
        word: A single token (the caller in this file passes it lower-cased).

    Returns:
        ``False`` if the token is not purely alphabetic, or if the token or
        any base form derived from it is in ``exclude``; ``True`` otherwise.
    """
    if not word.isalpha():
        return False
    if word in exclude:
        return False

    # NOTE: the rules below are cumulative -- ``word`` is rebound as suffixes
    # are removed, so later rules see the already-reduced form unless a rule
    # returns early.
    if word.endswith(('ment', 'ness')):
        word = word[:-4]                # "government" -> "govern"
        if word in exclude:
            return False

    if word.endswith('tion'):
        word = word[:-4]                # "production" -> "produc"
        if word in exclude:
            return False
        word += 't'                     # -> "product"
        if word in exclude:
            return False
        word = word[:-1] + 'e'          # -> "produce"
        if word in exclude:
            return False

    if word.endswith('ing'):
        word = word[:-3]                # "making" -> "mak"
        if word in exclude:
            return False
        word += 'e'                     # -> "make"
        if word in exclude:
            return False

    if word.endswith('ies'):
        word = word[:-3]                # "cities" -> "cit"
        if word in exclude:
            return False
        word += 'y'                     # -> "city"
        if word in exclude:
            return False

    if word.endswith('es'):
        word = word[:-2]                # "boxes" -> "box"
        if word in exclude:
            return False
        word += 'e'                     # -> "boxe"
        if word in exclude:
            return False
        word += 's'                     # restore the token for the rules below

    if word.endswith('ers'):
        # "workers" -> "worker" -> "worke" -> "work"
        for _ in range(3):
            word = word[:-1]
            if word in exclude:
                return False
        return True

    if word.endswith('est'):
        word = word[:-3]                # "biggest" -> "bigg"
        if word in exclude:
            return False
        word = word[:-1]                # -> "big"
        if word in exclude:
            return False

    if word.endswith('ied'):
        if word[:-2] in exclude:        # "taxied" -> "taxi"
            return False
        # Fix: the "-y" form was previously built from word[:-2], giving
        # "studiy" for "studied", so the real base form was never matched.
        if word[:-3] + 'y' in exclude:  # "studied" -> "study"
            return False
        # Fix: this branch shadows the generic "-ed" rule, so also try the
        # plain "-d" removal that rule would have produced.
        if word[:-1] in exclude:        # "tied" -> "tie"
            return False
        return True

    if word.endswith(('ted', 'ded')):
        word = word[:-2]                # "wanted" -> "want"
        if word in exclude:
            return False
        # Fix: verbs with a silent "e" were never matched here because this
        # branch returns before the generic "-ed" rule can restore the "e".
        if word + 'e' in exclude:       # "created" -> "create"
            return False
        word = word[:-1]                # "committed" -> "commit"
        if word in exclude:
            return False
        return True

    if word.endswith('ed'):
        word = word[:-2]                # "used" -> "us"
        if word in exclude:
            return False
        word += 'e'                     # -> "use"
        if word in exclude:
            return False
        return True

    if word.endswith('s') and len(word) > 3:
        word = word[:-1]                # "words" -> "word"
        if word in exclude:
            return False

    if word.endswith('ly'):
        if word.endswith('ily'):
            # Fix: the rebuilt "-y" form used to lose two more letters
            # ("happy" -> "hap") before the final lookup.
            word = word[:-3] + 'y'      # "happily" -> "happy"
        else:
            word = word[:-2]            # "quickly" -> "quick"
        if word in exclude:
            return False
    return True

# One comma-joined string of filtered words per processed paper; appended to
# by wc() and read by the script section at the bottom of the file.
todayAll = []

# Punctuation replaced by a space before tokenising.  Written as a raw string:
# the previous non-raw literal contained the invalid escape "\|".
_PUNCTUATION = re.compile(r'[-"|,.)(“”]')


def _crudeStem(word):
    """Chop a few common suffixes off ``word``; the rules apply in sequence."""
    if word.endswith('ing'):
        word = word[:-3]
    if word.endswith('ings'):
        word = word[:-4]
    if word.endswith('ers'):
        word = word[:-1]
    if word.endswith(('ies', 'ied')):
        word = word[:-3] + 'y'
    if word.endswith('ded'):
        word = word[:-2]
    if word.endswith(('es', 'ts')):
        word = word[:-1]
    return word


def wc(filename,outPath):
    """Extract the difficult words of one paper and write them to ``outPath``.

    The file is lower-cased and tokenised, tokens rejected by
    ``getOrigianlForm`` are dropped, and the survivors are counted.  The
    output file holds the 20 most frequent tokens on the first line, a blank
    line, then the crudely stemmed, de-duplicated words, all comma-separated.
    The same text is printed, and the de-duplicated word string is appended
    to the module-level ``todayAll`` list.

    Args:
        filename: Path of the text file to read.
        outPath: Path of the result file to (over)write.
    """
    tokens = []
    with open(filename, 'r') as fwc:
        for line in fwc:
            cleaned = _PUNCTUATION.sub(' ', line.lower())
            # Fix: split() with no argument splits on any whitespace.  The
            # previous split(' ') left the trailing newline glued to the last
            # token of each line, which then failed isalpha() and was lost.
            tokens.extend(
                tok for tok in cleaned.split()
                if len(tok) > 2 and getOrigianlForm(tok) and tok.isalpha()
            )

    counts = Counter(tokens)

    # NOTE(review): the banner below says "most 100" but only the top 20 are
    # taken; the runtime text is left untouched.
    topWords = [tok for tok, _ in counts.most_common(20)]
    print('--------------------------------------------------------------most 100')
    print(','.join(topWords))

    stems = []
    for tok, _ in counts.most_common():
        if len(tok) <= 2 or tok in exclude:
            continue
        stem = _crudeStem(tok)
        if stem in exclude:
            continue
        if stem.isalpha():  # also rejects a stem reduced to the empty string
            stems.append(stem)

    # De-duplicate while keeping first-seen (most frequent first) order, so the
    # output is identical from run to run instead of following set ordering.
    uniqueWords = ','.join(dict.fromkeys(stems))
    datas = ','.join(topWords) + '\n\n' + uniqueWords
    todayAll.append(uniqueWords)
    print('--------------------------------------------------------------all words string')
    print(datas)
    with open(outPath,'w') as fo:
        fo.write(datas)

if __name__ == "__main__":
    # Optional single command-line argument: the date (YYYY-MM-DD) whose
    # papers should be processed.  Any other argument count makes the
    # unpacking raise ValueError, in which case today's local date is used.
    try:
        script_name,dateStr = argv
    except ValueError as err:
        print(err)
        dateStr = time.strftime('%Y-%m-%d',time.localtime(time.time()))

    # The three values are not used afterwards; the conversion is kept because
    # it rejects a malformed date string early with a ValueError.
    toYear,toMonth,toDay = list(map(int,dateStr.split('-')))
    readPath = './mds/' + dateStr +'/papers/'
    outPath = './mds/' + dateStr +'/words/'
    readMds = []
    # Manual debugging switch: when True, process one local file and stop.
    testGo = False
    if testGo:
        wc('todaymd.md','todaymd.txt')
        sys.exit()
    if os.path.exists(readPath):
        # Sorted so papers are processed in a reproducible order; skip
        # sub-directories, which wc() could not open as files.
        for item in sorted(os.listdir(readPath)):
            if os.path.isfile(readPath+item):
                readMds.append([readPath+item,item])
    else:
        print('Dir not found!')
        os.makedirs(readPath)
        sys.exit()
    if not readMds:
        print('NO PAPER TO FILTER!')
        sys.exit()

    os.makedirs(outPath, exist_ok=True)

    for countPaper, paperName in readMds:
        # Fix: swap the real extension for ".txt".  The previous
        # ``name[0:-2]+'txt'`` was only correct for names ending in ".md".
        outWords = outPath + os.path.splitext(paperName)[0] + '.txt'
        print(countPaper,outWords)
        wc(countPaper,outWords)

    # Merge the per-paper word strings into one de-duplicated list.
    # Fix: a paper that produced no words contributes an empty string, which
    # used to end up as an empty token in the output; such tokens are dropped.
    # First-seen order is kept so the file is reproducible between runs.
    mergedWords = dict.fromkeys(
        word
        for paperWords in todayAll
        for word in paperWords.split(',')
        if word
    )
    todayAllWords = './mds/' + dateStr + '/allWords.txt'
    with open(todayAllWords,'w')as foo:
        foo.write(','.join(mergedWords))
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值