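# -*- coding: utf-8 -*-
# Python: using urllib2 and BeautifulSoup to scrape desktop wallpapers
# from the ZOL desktop wallpaper site (desk.zol.com.cn).
"""
Crawl wallpaper pages on desk.zol.com.cn and download the images they show.
"""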
from bs4 import BeautifulSoup
import urllib2
import urllib
import re
# URLs of the wallpaper pages reached by the crawl loop at the bottom of
# this script, in the order they were visited.
page_URL=[]
# Image URLs extracted from those pages; the script hands this list to
# download_Img once the crawl loop finishes.
img_URL=[]
def paser_Page_URL(url):
    """Return the URL of the page that follows ``url``, or None.

    The page at ``url`` is fetched (5 second timeout) and the element whose
    id is ``pageNext`` is looked up.  Its ``href`` decides the result:

    * ``javascript:;`` -- the link is a placeholder, so the lookup is
      delegated to ``get_Next_Page_URL(url)``;
    * anything else -- the href is prefixed with ``http://desk.zol.com.cn``.

    Returns None when the page has no ``pageNext`` element or the element
    has no ``href`` attribute.  Errors raised by ``urllib2.urlopen`` are not
    caught here.
    """
    response = urllib2.urlopen(url, timeout=5)
    try:
        soup = BeautifulSoup(response)
    finally:
        # The document is fully read by the constructor; release the socket.
        response.close()

    # find() yields None for a missing element, whereas indexing the result
    # of find_all() raised IndexError before the None check could run.
    next_link = soup.find(id='pageNext')
    if next_link is None:
        return None

    the_url = next_link.get('href')
    if the_url is None:
        # No href to follow; concatenating None below would raise TypeError.
        return None
    if the_url == 'javascript:;':
        return get_Next_Page_URL(url)
    return 'http://desk.zol.com.cn' + the_url
def paser_Img_URl(url):
    """Return the ``src`` of the element with id ``bigImg`` on page ``url``.

    Returns None when the page has no such element, or when the element has
    no ``src`` attribute.  Errors raised by ``urllib2.urlopen`` are not
    caught here.
    """
    response = urllib2.urlopen(url)
    try:
        soup = BeautifulSoup(response)
    finally:
        # The document is fully read by the constructor; release the socket.
        response.close()

    # find() yields None for a missing element, whereas indexing the result
    # of find_all() raised IndexError before the None check could run.
    big_img = soup.find(id='bigImg')
    if big_img is None:
        return None
    return big_img.get('src')
def download_Img(img_url):
    """Download every URL in ``img_url`` to ``D:\\IMGS\\<index>.jpg``.

    ``img_url`` is an iterable of image URLs.  Each file is named after the
    position of its URL in the iterable, and a progress line is printed
    after each download.  Entries that are None or empty are skipped (their
    index is left unused) instead of being passed to ``urlretrieve``.

    Returns None.  Errors raised by ``urllib.urlretrieve`` are not caught.
    """
    for x, url in enumerate(img_url):
        if not url:
            # A page without a usable image contributes no URL; skip it.
            continue
        # Raw string: same path as before, without relying on "\I" and "\%"
        # happening not to be escape sequences.
        urllib.urlretrieve(url, r'D:\IMGS\%s.jpg' % x)
        print('下载第 %s 张完成' % x)
def get_Next_Page_URL(url):
    """Return the link found in the second ``.txt`` element of page ``url``.

    The page is fetched, the elements matching the CSS selector ``.txt``
    are collected, and the first ``href="..."`` value inside the markup of
    the second match is prefixed with ``http://desk.zol.com.cn``.

    Returns None when the page has fewer than two ``.txt`` elements or the
    second one contains no ``href``; previously both cases raised
    IndexError.  Errors raised by ``urllib2.urlopen`` are not caught here.
    """
    response = urllib2.urlopen(url)
    try:
        soup = BeautifulSoup(response)
    finally:
        # The document is fully read by the constructor; release the socket.
        response.close()

    candidates = soup.select('.txt')
    if len(candidates) < 2:
        return None

    # Only the first href is wanted, so search() replaces findall()[0].
    match = re.search(r'href="(.+?)"', str(candidates[1]))
    if match is None:
        return None
    return 'http://desk.zol.com.cn' + match.group(1)
root_url='http://desk.zol.com.cn/bizhi/7089_87888_2.html'
num=0
while num<=10:
url=paser_Page_URL(root_url)
print url
if url is not None:
page_URL.append(url)
root_url=url
img_url=paser_Img_URl(url)
img_URL.append(img_url)
print '添加',num
num+=1
else:
num+=1
download_Img(img_URL)