Python网页爬虫简介:
有时候我们需要把一个网页的图片copy 下来。通常手工的方式是鼠标右键 save picture as ...
python 网页爬虫可以一次性把所有图片copy 下来。
步骤如下:
1. 读取要爬虫的html
2. 对爬下来的html 进行存储并处理:存储原始html
过滤生成list
正则匹配出picture的链接
3. 根据链接保存图片到本地
主要的难点:熟悉urllib ,
正则匹配查找图片链接
代码如下:import urllib.request
import os
import redef getHtml(url): #get html
page = urllib.request.urlopen(url)
html = page.read()
return html
def write(html, htmlfile): #write html into a file name html.txt
try:
f = open(htmlfile, mode='w')
f.writelines(str(html))
f.close()
except TypeError:
print ("write html file failed")def getImg2(html, initialFile, finalFile):
reg = '"*' #split string html with " and write in file name re.txt
imgre1 = re.compile(reg)
imglist = re.split(imgre1, str(html))
f1 = open(initialFile, mode='w')
for index in imglist:
f1.write("\n")
f1.write(index)
f1.close
reg2 = "^https.*jpg" # match items start with "https" and ends with "jpg"
imgre2 = re.compile(reg2)
f2 = open(initialFile, mode='r')
f3 = open(finalFile, mode='w')
tempre = f2.readlines()
for index in tempre:
temp = re.match(imgre2,index)
if temp != None:
f3.write(index)
#f3.write("\n")
f2.close()
f3.close()def saveImg2(p_w_picpathfile): #save p_w_picpath
f_imglist2 = open(p_w_picpathfile, mode='r')
templist = f_imglist2.readlines()
x = 0
for index in templist:
urllib.request.urlretrieve(index,'%s.jpg' %x)
x = x + 1html = "https://p_w_picpath.baidu.com/search/index?tn=baidup_w_picpath&ct=201326592&lm=-1&cl=2&ie=gbk&word=%BA%FB%B5%FB&fr=ala&ala=1&alatpl=adress&pos=0&hs=2&xthttps=111111"
htmlfile = "D:\\New\\html.txt"
SplitFile = "D:\\New\\re.txt"
imgefile = "D:\\New\\imglist.txt"html = getHtml(html)
print("get html complete!")
getImg2(html, SplitFile, imgefile)
print("get Image link list complete! ")
saveImg2(imgefile)
print("Save Image complete!")