Python: download a WeChat Official Account article (including its images), segment the text with jieba, then search for a given word and replace it with bold red highlighting



#-*- coding:utf-8 -*-

import sys
reload(sys)
sys.setdefaultencoding("gbk")      # Python 2 only: default encoding for implicit str/unicode conversion

from re import findall
import requests
import re
from urllib import urlopen

from bs4 import BeautifulSoup


def modifyip(tfile, sstr, rstr):
    # Replace every occurrence of sstr with rstr in the file tfile, in place.
    try:
        lines = open(tfile, 'r').readlines()
        for i in range(len(lines)):                 # the original range(len-1) skipped the last line
            if sstr in lines[i]:
                lines[i] = lines[i].replace(sstr, rstr)
        open(tfile, 'w').writelines(lines)
    except Exception, e:
        print e



url = 'http://mp.weixin.qq.com/s/l9J5CLSR2Oo83QwQx119vw'
content = urlopen(url).read()      # download the raw article HTML
f1 = file('p.htm', 'wb')           # keep a local copy of the page
f1.write(content)
f1.close()
#f2=file('pic.txt','wb')

import os

# Split the page on <script>/<style> markers; the chunks at even indices are the
# content that lies outside those tags, and only they are kept.
altxt = open('p.htm').read()
pattern = '<script(.+?)</script>'
#out = re.sub(pattern,' ', altxt)
listpp = re.split(r'<script|</script>|<style>|</style>', altxt)
f1 = file('p1.htm', 'w')
for idx, part in enumerate(listpp):
    if idx % 2 == 0:          # even chunks: text outside <script>/<style>
        f1.write(part)
f1.close()
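
# The commented-out re.sub above suggests a simpler route. A hedged one-pass
# equivalent (left as an unused variable so the script's behaviour is unchanged)
# uses the re.S flag so '.' also matches newlines and whole blocks are removed:
cleaned = re.sub(r'<script.*?</script>|<style>.*?</style>', ' ', altxt, flags=re.S)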

# Split the remaining HTML on '<' and '>'; even-indexed chunks are the text
# between tags and are written to p2.htm, one per line.
altxt = open('p1.htm').read()
listpp = re.split(r'<|>', altxt)
f1 = file('p2.htm', 'w')
for idx, part in enumerate(listpp):
    if idx % 2 == 0:          # even chunks: text content, odd chunks: tag bodies
        f1.write(part + chr(10))
f1.close()

#f4=open('p2.htm','r+')
#altxt = f4.read()
#altxt =  altxt.replace('head','')
#print altxt
#f4.write(altxt)
#f4.close()

# Remove the bare head/body/html tag names that survive the tag stripping above.
# The file is rewritten in place; truncate() discards any stale tail left over
# because the replaced text is shorter than the original.
with open('p2.htm', 'r+') as f:
    t = f.read()
    for tag in ('/head', 'head', '/body', 'body', '/html', 'html'):
        t = t.replace(tag, '')
    #t = t.replace('&nbsp;', '')

    # move the read/write offset back to the start of the file
    f.seek(0, 0)
    f.write(t)
    f.truncate()



def delblankline(infile, outfile):
    # Copy infile to outfile, dropping blank lines.
    infopen = open(infile, 'r')
    outfopen = open(outfile, 'w')
    for line in infopen.readlines():
        if line.split():              # keep only lines with non-whitespace content
            outfopen.write(line)
    infopen.close()
    outfopen.close()
    
delblankline("p2.htm","p3.txt")
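
# The whole p.htm -> p1.htm -> p2.htm -> p3.txt pipeline above extracts the visible
# text by hand. A hedged shortcut (computed here but not used below, so the files
# above are still produced) is BeautifulSoup's get_text():
soup_full = BeautifulSoup(open('p.htm').read(), 'html.parser')
for tag in soup_full(['script', 'style']):
    tag.extract()                     # drop script/style bodies, as the manual pipeline does
plain_text = soup_full.get_text('\n')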

# WeChat keeps the real image URL in the data-src attribute of each <img> tag.
pattern = ' data-src="(.+?)"'

content = open('p.htm').read()
result = findall(pattern, content)
#print result
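
# BeautifulSoup is imported at the top but never used. As a hedged alternative to
# the regular expression, the same image URLs could be collected from the parsed
# DOM (a sketch, assuming html.parser copes with the WeChat markup; unused below):
soup = BeautifulSoup(content, 'html.parser')
bs_result = [img['data-src'] for img in soup.find_all('img', attrs={'data-src': True})]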



# Download each image referenced by data-src, save it locally, and rewrite the
# reference in p.htm so the saved article points at the local file.
for index, item in enumerate(result):

    if 'png' in item:
        data = urlopen(item).read()
        modifyip('p.htm', ' data-src="' + item, ' src="' + str(index) + '.png" ')
        f = file(str(index) + '.png', "wb")
        f.write(data)
        f.close()

    if 'jpeg' in item:
        data = urlopen(item).read()
        modifyip('p.htm', ' data-src="' + item, ' src="' + str(index) + '.jpg" ')
        f = file(str(index) + '.jpg', "wb")
        f.write(data)
        f.close()

    if 'gif' in item:
        data = urlopen(item).read()
        modifyip('p.htm', ' data-src="' + item, ' src="' + str(index) + '.gif" ')
        f = file(str(index) + '.gif', "wb")
        f.write(data)
        f.close()
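
# requests is imported at the top but never used. If WeChat starts rejecting the
# bare urllib calls, a hedged fallback is to fetch images with requests and a
# browser-like User-Agent (a hypothetical helper; the header value is an assumption):
def fetch_image(img_url, local_name):
    headers = {'User-Agent': 'Mozilla/5.0'}          # assumed UA string
    resp = requests.get(img_url, headers=headers, timeout=30)
    open(local_name, 'wb').write(resp.content)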

 



# remove the intermediate files
os.remove('p1.htm')
os.remove('p2.htm')

# ---- part 2: segment the text and highlight each word ----

import os, sys, re          # already imported above; re-importing is harmless

import pandas as pd

import jieba
import jieba.analyse

import codecs

# Don't keep words that are too common, otherwise far too many (and too large) files are generated.

# jieba returns every token; quick test on a snippet from a paginated article:
#strzk3 = """网络的资源占用情况"""
#listzk1 = list(jieba.cut(strzk3))


## read the text file into memory
fp = codecs.open("p3.txt", 'r', encoding='utf-8')
content = fp.read()
fp.close()
segments = []
## segment the in-memory string with jieba
listzk1 = list(jieba.cut(content))
## optionally keep only tokens longer than one character in segments
#for seg in listzk1:
#    if len(seg) > 1:
#        segments.append(seg)
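
# jieba.analyse is imported but never used. If only the most relevant words should
# be highlighted instead of every distinct token, a hedged option is TF-IDF keyword
# extraction (computed here but not used below; topK=20 is an arbitrary choice):
keywords = jieba.analyse.extract_tags(content, topK=20)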




# keep one copy of each distinct token, preserving first-seen order
listzk = []
the_set = set()
for level in listzk1:
    if level not in the_set:
        the_set.add(level)
        listzk.append(level)


df = pd.DataFrame(listzk, columns=['word'])

df2 = df.sort_values(by=['word'], ascending=False)  # note: this orders by Unicode code point, not by Chinese pinyin


df2.to_csv('test.csv', sep=',', header=False, index=False)  # re-sort by pinyin after opening the file, if needed
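
# sort_values above orders words by code point, not by pinyin. A hedged way to get
# pinyin order is the third-party pypinyin package (an assumed extra dependency,
# pip install pypinyin, not used anywhere else in this script):
try:
    from pypinyin import lazy_pinyin
    listzk_pinyin = sorted(listzk, key=lambda w: lazy_pinyin(w))   # pinyin-ordered copy, not used below
except ImportError:
    listzk_pinyin = listzk            # pypinyin not installed: keep the code-point order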

# For every distinct word: append it to 3.txt, then make a copy of the article
# ("x 1.htm", "x 2.htm", ...) in which that word is rendered in bold red.
f = open('3.txt', 'w+')

import shutil

for pos, word_u in enumerate(df2['word']):
    f.write(word_u + chr(10))
    word = word_u.encode('utf8')

    target = "x " + '%d' % (pos + 1) + '.htm'
    shutil.copyfile("p.htm", target)
    # the unclosed <p> tags switch the colour to red for the word and back to black for the text after it
    modifyip(target, word, '<p style="color:red"><B>' + word + '</B><p style="color:black">')

f.close()
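
# The replacement string above leaves unclosed <p> tags, which fragments the original
# paragraphs. A hedged, better-formed alternative (a hypothetical helper, not called
# above) wraps the word in a closed inline tag so the layout is preserved:
def highlight_word(htm_file, word):
    modifyip(htm_file, word,
             '<span style="color:red;font-weight:bold">' + word + '</span>')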

#word="丰富".encode('utf8')
 
 
#shutil.copyfile("p.htm", word.decode('gbk')+'.htm')
#modifyip(word.decode('gbk')+'.htm',word,'<p style="color:red"><B>'+word+'</B><p style="color:black">')


