背景:
放假在家里没事鼓捣了一个用python实现的简单网页信息抓取的程序。
demo功能:
依次抓取千图网商业海报专题的第 1~5 页(http://www.58pic.com/topic/419-1.html 至 419-5.html),用正则表达式从每一页中提取每张海报的(缩略图地址、详情页地址、海报名字),并追加写入文件 filed='python_load_demo.txt' 中;然后把每一页第一张海报的缩略图下载到文件夹 imgpath = "python_img" 中。
程序版本:
python3.5.2
代码:
#coding=gbk
import os
import urllib.request
import re
def getPage(url, timeout=30):
    """Open *url* with a browser-like User-Agent and return the response.

    Args:
        url: Address to request.
        timeout: Seconds a blocking socket operation may take before
            ``urlopen`` gives up. Without a timeout a stalled server would
            hang the script indefinitely.

    Returns:
        The still-open response object returned by
        ``urllib.request.urlopen``. The caller must read and close it.

    Raises:
        urllib.error.URLError: If the request fails or times out while
            connecting.
    """
    # Identify as a desktop Firefox browser instead of urllib's default agent.
    headers = {'User-Agent':'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:23.0) Gecko/20100101 Firefox/23.0'}
    req = urllib.request.Request(url=url, headers=headers)
    return urllib.request.urlopen(req, timeout=timeout)
def getHtml(url, encoding='gbk'):
    """Download *url* and return its body decoded to text.

    Args:
        url: Address of the page to download.
        encoding: Character encoding used to decode the response bytes.
            Defaults to ``'gbk'``; pass the page's real encoding for sites
            that use a different one.

    Returns:
        The page content as a ``str``.

    Raises:
        UnicodeDecodeError: If the body is not valid in *encoding*.
    """
    # The context manager closes the HTTP response even if read() or
    # decode() raises, so the connection is not leaked.
    with getPage(url) as page:
        return page.read().decode(encoding)
def getImg(html, reg, flags=0):
    """Return every match of the regular expression *reg* found in *html*.

    Args:
        html: Text to search.
        reg: Regular expression (pattern string or compiled pattern).
        flags: Optional ``re`` flags applied when compiling *reg*, e.g.
            ``re.S`` so that ``.`` also matches newlines in multi-line HTML.
            Must stay ``0`` when *reg* is already a compiled pattern.

    Returns:
        The list produced by ``findall``: strings when the pattern has at
        most one group, tuples of strings when it has several groups.
    """
    # Use the compiled pattern's own method rather than routing it back
    # through the module-level re.findall.
    return re.compile(reg, flags).findall(html)
def loadImg(url, path):
    """Download the resource at *url* and save its bytes to the file *path*.

    Args:
        url: Address of the resource (an image in this script).
        path: Destination file name. An existing file is overwritten.
    """
    # Read the whole body first so that a failed download does not leave a
    # truncated file behind; the response is closed as soon as it is read.
    with getPage(url) as page:
        data = page.read()
    # Binary mode: the payload is written exactly as received.
    with open(path, "wb") as f1:
        f1.write(data)
def loadList(path, d, encoding=None):
    """Append the rows in *d* to the text file *path*.

    Each row is written as one line in which every field is preceded by a
    tab character, and the line is terminated by ``"\\r\\n"``.

    Args:
        path: File to append to; it is created when missing.
        d: Iterable of rows, each row an iterable of strings.
        encoding: Text encoding for the file. ``None`` keeps the platform
            default used by ``open``.
    """
    rows = [''.join('\t' + v for v in t) + "\r\n" for t in d]
    # newline='' turns off newline translation so the explicit "\r\n" is
    # written as-is; with the default, Windows would emit "\r\r\n".
    with open(path, "a", encoding=encoding, newline='') as f1:
        f1.write(''.join(rows))
# Listing-page URL template on 58pic.com; %d is replaced by the page number.
url = 'http://www.58pic.com/topic/419-%d.html'
start_num = 1
end_num = 5
# Regex 1: captures (poster image URL, poster detail-page URL, poster name).
reg = r'src="([^"]*jpg).*?<p><a href="([^"]*)" target="_blank" title="[^"]*">([^<]*)</a></p>'
# Name of the exported list file.
filed = 'python_load_demo.txt'
imgpath = "python_img"
# Create the image folder with a plain relative path. A '.\\' prefix is only
# a path separator on Windows; elsewhere it would create a directory
# literally named '.\python_img' and the later save would fail.
os.makedirs(imgpath, exist_ok=True)
for i in range(start_num, end_num + 1):
    turl = url % i
    html = getHtml(turl)
    d = getImg(html, reg)
    # Only export and download when the page produced at least one match.
    if d:
        print(len(d))
        loadList(filed, d)
        # The poster name becomes the file name, so replace characters that
        # are not allowed in file names (path separators, ':', '*', ...).
        safe_name = re.sub(r'[\\/:*?"<>|]', '_', d[0][2])
        # Download only the first poster's thumbnail of this page.
        loadImg(d[0][0], os.path.join(imgpath, "%s.jpg" % safe_name))