python抓取搜索微信

最新推荐文章于 2024-05-03 11:56:40 发布

zengna_com

最新推荐文章于 2024-05-03 11:56:40 发布

阅读量2k

点赞数

本文链接：https://blog.csdn.net/thundor/article/details/37603061

版权

#coding:utf-8
import urllib
import re
from urllib import quote
import HTMLParser
import time

def decodeHtml(inhtml):
    """Return *inhtml* with HTML character references decoded.

    For example ``&amp;`` becomes ``&`` and ``&lt;`` becomes ``<``.

    ``html.unescape`` is used when the interpreter provides it; otherwise
    the function falls back to ``HTMLParser.HTMLParser().unescape`` exactly
    as the Python 2 version of this script did.
    """
    try:
        # HTMLParser.unescape() was never a documented API and was removed in
        # Python 3.9; html.unescape is its documented replacement.
        from html import unescape
    except ImportError:
        unescape = HTMLParser.HTMLParser().unescape
    return unescape(inhtml)

def strip_tags(html):
    """Return the text content of *html* with all markup removed.

    Surrounding whitespace is trimmed from the input first, then the
    fragment is run through an HTML parser that keeps only the text chunks
    reported via ``handle_data`` and joins them in document order.
    """
    pieces = []

    class _TextOnly(HTMLParser.HTMLParser):
        # Collect every run of character data; tags are simply ignored.
        def handle_data(self, data):
            pieces.append(data)

    extractor = _TextOnly()
    extractor.feed(html.strip())
    # close() flushes any text the parser is still buffering.
    extractor.close()
    return "".join(pieces)

def saveImage(count,url):
    """Download *url* and store it as ``Images/<count>.jpg``.

    count -- integer used as the file name (formatted with ``%d``).
    url   -- address of the image to fetch.

    The image is downloaded completely before the output file is opened, so
    a failed request no longer leaves an empty file behind, and both the
    HTTP response and the file are always closed.  The ``Images`` directory
    is created if it does not exist yet.
    """
    import os

    response = urllib.urlopen(url)
    try:
        data = response.read()
    finally:
        response.close()

    # Python 2 has no makedirs(exist_ok=True), hence the explicit check.
    if not os.path.isdir('Images'):
        os.makedirs('Images')
    with open('Images/%d.jpg'%count,'wb') as f:
        f.write(data)

def timeSwap(timeStamp):
    """Format a Unix timestamp as a ``YYYY-MM-DD HH:MM:SS`` local-time string.

    *timeStamp* is passed through ``int()`` first, so a numeric string such
    as ``"1404888960"`` is accepted as well as a number.
    """
    seconds = int(timeStamp)
    return time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(seconds))

def getWeixinInfo(keyword):
    """Search Sogou Weixin for *keyword*, print each hit and save its image.

    The result page is fetched once and scraped with regular expressions
    for titles, summaries, timestamps and the ``img_box2`` blocks, from
    which the article links (``href``) and image URLs (``src``) are taken.
    For every hit the link, title, image URL, summary and formatted time
    are printed and the image is stored through ``saveImage`` under a
    1-based running number.

    The scraped lists are matched up by position.  They are walked with
    ``zip`` so that, if the page yields a different number of matches per
    field, processing stops at the shortest list instead of raising
    ``IndexError`` part-way through.
    """
    print(quote(keyword))
    url = "http://weixin.sogou.com/weixin?type=2&query="+quote(keyword)+"&ie=utf8&_ast=1404888960&_asf=null&w=01029901&cid=null"
    webcontent = urllib.urlopen(url).read()

    # Raw strings keep the regex escapes (\w, \d, \() out of Python's own
    # string-escape processing; the patterns themselves are unchanged.
    title = re.findall(r'id="\w+_title_\d+">(.*?)</a>',webcontent)
    summary = re.findall(r'id="\w+_summary_\d+">(.*?)</p>',webcontent)
    timeStamp = re.findall(r"vrTimeHandle552write\('(.*?)'\)",webcontent)
    imagesInfo = re.findall(r'<div class="img_box2">[\w\W]+?</div>',webcontent)

    link,imageSrc = [],[]
    for imgHtml in imagesInfo:
        link += re.findall('href="(.*?)"',imgHtml)
        imageSrc += re.findall('src="(.*?)"',imgHtml)

    rows = zip(link,title,imageSrc,summary,timeStamp)
    for count,(href,rawTitle,src,rawSummary,stamp) in enumerate(rows,1):
        # Attribute values come HTML-escaped (&amp;), so decode before use.
        imageUrl = decodeHtml(src)
        print("save the %s image"%str(count))
        print(decodeHtml(href))
        print(strip_tags(rawTitle))
        print(imageUrl)
        print(strip_tags(rawSummary))
        print(timeSwap(stamp))
        saveImage(count,imageUrl)
    
# Script entry point: run one sample search ("世界杯" means "World Cup").
getWeixinInfo("世界杯")

zengna_com

关注

0
点赞
踩
2

收藏

觉得还不错? 一键收藏
0
评论
python抓取搜索微信

#coding:utf-8import urllibimport refrom urllib import quoteimport HTMLParserimport timedef decodeHtml(inhtml): h = HTMLParser.HTMLParser() s = h.unescape(inhtml) return s
复制链接

扫一扫