#coding:utf-8
import urllib
import re
from urllib import quote
import HTMLParser
import time
def decodeHtml(inhtml):
    """Return ``inhtml`` with HTML character references decoded.

    Named and numeric references such as ``&amp;`` or ``&#39;`` are replaced
    by the characters they stand for.

    Args:
        inhtml: Text that may contain HTML character references.

    Returns:
        The decoded text.
    """
    try:
        # Python 3.4+: the supported API.  The ``unescape`` method on
        # HTMLParser was never documented and was removed in Python 3.9.
        from html import unescape
    except ImportError:
        # Python 2 has no ``html`` module; fall back to the parser method.
        from HTMLParser import HTMLParser as _Parser
        unescape = _Parser().unescape
    return unescape(inhtml)
def strip_tags(html):
    """Return the text content of ``html`` with the markup removed.

    Leading and trailing whitespace is stripped from the input before it is
    parsed; the character data reported by the parser is then concatenated
    in document order.

    Args:
        html: An HTML fragment.

    Returns:
        The concatenated character data of the fragment.
    """
    # NOTE(review): how character references (``&amp;`` etc.) inside text are
    # reported depends on the HTMLParser version in use - confirm if
    # entity-bearing input matters to callers.
    try:
        from html.parser import HTMLParser as _Parser  # Python 3
    except ImportError:
        from HTMLParser import HTMLParser as _Parser  # Python 2

    pieces = []
    parser = _Parser()
    # Send every run of character data straight into ``pieces``; all other
    # parser events keep their default handlers.
    parser.handle_data = pieces.append
    # A bare strip() already removes newlines along with other whitespace,
    # so no separate strip("\n") pass is needed.
    parser.feed(html.strip())
    parser.close()
    return "".join(pieces)
def saveImage(count, url):
    """Download ``url`` and store the bytes as ``Images/<count>.jpg``.

    The ``Images`` directory (relative to the current working directory) is
    created if it does not exist yet.

    Args:
        count: Integer used to build the output file name.
        url: Address of the image to download.

    Raises:
        IOError/OSError: If the download or the file write fails.
    """
    import os
    from contextlib import closing
    try:
        from urllib.request import urlopen  # Python 3
    except ImportError:
        from urllib import urlopen  # Python 2

    # Fetch the payload before touching the file system so that a failed
    # download does not leave an empty or truncated file behind.
    with closing(urlopen(url)) as response:
        payload = response.read()

    if not os.path.isdir('Images'):
        os.makedirs('Images')
    # ``with`` guarantees the handle is closed even if the write raises.
    with open('Images/%d.jpg' % count, 'wb') as out:
        out.write(payload)
def timeSwap(timeStamp, fmt="%Y-%m-%d %H:%M:%S"):
    """Format a Unix timestamp as a local-time string.

    Args:
        timeStamp: Seconds since the epoch, as an int or any value that
            ``int()`` accepts (for example a string of digits).
        fmt: ``time.strftime`` format for the result.  Defaults to
            ``"%Y-%m-%d %H:%M:%S"``.

    Returns:
        The timestamp rendered with ``fmt`` in the local time zone.

    Raises:
        ValueError: If ``timeStamp`` cannot be converted by ``int()``.
    """
    return time.strftime(fmt, time.localtime(int(timeStamp)))
def _parseWeixinResults(webcontent):
    """Extract the raw per-article fields from a result page.

    Returns a list of ``(link, title, imageSrc, summary, timeStamp)`` tuples
    holding the strings exactly as they appear in the page source.
    """
    title = re.findall(r'id="\w+_title_\d+">(.*?)</a>', webcontent)
    summary = re.findall(r'id="\w+_summary_\d+">(.*?)</p>', webcontent)
    timeStamp = re.findall(r"vrTimeHandle552write\('(.*?)'\)", webcontent)
    link, imageSrc = [], []
    for imgHtml in re.findall(r'<div class="img_box2">[\w\W]+?</div>',
                              webcontent):
        link += re.findall(r'href="(.*?)"', imgHtml)
        imageSrc += re.findall(r'src="(.*?)"', imgHtml)
    # The five lists are scraped independently and matched up by position.
    # zip() stops at the shortest one, so a page where they differ in length
    # yields fewer entries instead of raising IndexError mid-way.
    return list(zip(link, title, imageSrc, summary, timeStamp))


def getWeixinInfo(keyword):
    """Search weixin.sogou.com for ``keyword`` and report each result.

    For every result found, the link, title, image URL, summary and
    publication time are printed and the image is saved through
    ``saveImage`` under a running number starting at 1.

    Args:
        keyword: Search term; it is URL-quoted before being sent.

    Returns:
        None.
    """
    from contextlib import closing
    try:
        from urllib.parse import quote  # Python 3
        from urllib.request import urlopen
    except ImportError:
        from urllib import quote, urlopen  # Python 2

    encoded = quote(keyword)
    print(encoded)
    url = ("http://weixin.sogou.com/weixin?type=2&query=" + encoded
           + "&ie=utf8&_ast=1404888960&_asf=null&w=01029901&cid=null")
    with closing(urlopen(url)) as response:
        webcontent = response.read()
    # On Python 3 the body arrives as bytes, which the str regex patterns
    # cannot search.  The request asks for ``ie=utf8``, so UTF-8 is assumed
    # here - TODO confirm against the response headers.
    if not isinstance(webcontent, str):
        webcontent = webcontent.decode("utf-8", "replace")

    for count, entry in enumerate(_parseWeixinResults(webcontent), 1):
        link, title, imageSrc, summary, timeStamp = entry
        imageUrl = decodeHtml(imageSrc)
        print("save the %s image" % count)
        print(decodeHtml(link))
        print(strip_tags(title))
        print(imageUrl)
        print(strip_tags(summary))
        print(timeSwap(timeStamp))
        saveImage(count, imageUrl)
# Run the scraper for a sample keyword (Chinese for "World Cup").  This call
# sits at module top level, so it executes on import as well as when the
# file is run as a script.
getWeixinInfo("世界杯")
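# python抓取搜索微信 (Scraping Sogou WeChat search results with Python)
# 最新推荐文章于 2024-05-03 11:56:40 发布 (Latest recommended article published 2024-05-03 11:56:40)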