python爬虫入门,使用urllib爬取一个网站的图片。
- 用beautifulsoup来解析文档中的链接、
- 使用urllib来打开并进行下载图片
- 使用多线程同时对网页进行爬取和下载
程序结构:程序主要是从网站的一个网页开始,首先获取网站的所有图片的主链接放到一个集合里面,然后打开这些图片的链接下载里面的图片。其中获取图片分类的主链接用了一个线程,获取所有图片的地址又使用了一个线程,最后使用多个线程进行图片的下载。其中涉及到的问题是:下载过快会导致获取图片地址的线程来不及产出图片的地址,获取分类链接的线程同理。这是一个典型的生产者消费者问题。这里爬取的网站是www.meizitu.com
# -*- coding: utf-8 -*-
from bs4 import BeautifulSoup,SoupStrainer
from threading import Lock,Thread
import sys,time,os
from urlparse import urlparse, urljoin
from urllib2 import urlopen
from urllib import urlretrieve
# Entry page of the crawl: the category index we start from.
begin = 'http://www.meizitu.com/a/xinggan.html'
# Gallery-page URLs waiting to have their image links extracted (shared).
picset = set()
# Image URLs waiting to be downloaded (shared).
meiziset = set()
# Single lock guarding both shared sets ("look" appears to be a typo for "lock").
look = Lock()
其中我将代码分成了几个函数。本来是想用面向对象的方式来写的,但是想了半天不知道如何动手,所以就直接写成函数了。
def getHtml(url):
'''
@url:需要获取html文档的url
下载url的html文档
'''
f = urlopen(url)
if f.geturl() != url:
print 'chong ding xiang ',f.geturl
return None
data = f.read()
f.close()
return data
获取url中的链接
def getURLlist(data, so=None, finds=None, ss='href'):
    '''
    @data: a downloaded HTML document
    @so: a SoupStrainer instance selecting the region to search
    @finds: tag name to look for inside that region
    @ss: attribute to extract from each matched tag
    Yields the value of attribute ss for every matching tag.
    Yields nothing when the region or attribute is absent.
    '''
    soup = BeautifulSoup(data, 'html.parser')
    links = soup.find(so)
    # BUG FIX: some pages lack the selected region; soup.find() then
    # returns None and links.find_all() raised AttributeError.
    if links is None:
        return
    for x in links.find_all(finds):
        # BUG FIX: skip tags that lack the requested attribute instead
        # of raising KeyError on x[ss].
        if x.has_attr(ss):
            yield x[ss]
def download(url):
'''
@url:图片的src
'''
site = urlparse(url).netloc.split('@')[-1].split(':')[0]
if site.startswith('www'):
print 'skipping this url'
return
path = url[-18:]
dirs = '/home/young/mei/'
name = dirs+path.replace('/','_')
if not os.path.exists(name):
data = urlretrieve(url,name)
else:
print 'cunzai'
def getPageUrl():
    '''
    Producer stage 1: collect gallery-page URLs into the shared picset.

    Starting from `begin`, read every category link in the "tags" div.
    For each category page, gather the gallery links in its
    'wp-list clearfix' region, then follow every numbered pagination
    link ('wp_page_numbers') and gather those pages' gallery links too.
    '''
    global begin
    global picset
    data = getHtml(begin)
    so = SoupStrainer('div',class_="tags")
    for cs in set(getURLlist(data,so,'a')):
        print '\nfrom ',cs,"get html"
        # NOTE: data/so are deliberately reused for each category page.
        data = getHtml(cs)
        so = SoupStrainer(class_='wp-list clearfix')
        s = getURLlist(data,so,'a')
        # Add gallery links to the shared set under the lock.
        with look:
            picset.update(s)
        # Pagination links are relative; join them onto the site base.
        so = SoupStrainer('div',id='wp_page_numbers')
        numset = set(urljoin('http://www.meizitu.com/a/',x)for x in getURLlist(data,so,'a'))
        print 'there are ',len(numset),'numbers'
        for nu in numset:
            print nu
            data = getHtml(nu)
            so = SoupStrainer(class_='wp-list clearfix')
            lists = getURLlist(data,so,'a')
            with look:
                picset.update(lists)
ef getPicUrl():
'''
下载图片的主要方法
'''
global picset
while True:
with look:
try:
url = picset.pop()
except KeyError:
print 'pic is empty'
break
print 'from picset ',url
data = getHtml(url)
so = SoupStrainer('div',class_="postContent")
lists = getURLlist(data,so,'img','src')
with look:
meiziset.update(lists)
def getPic():
'''
下载图片的主地址
'''
global meiziset
while True:
with look:
try:
url = meiziset.pop()
except KeyError:
print 'download error'
break
print 'download ',url
download(url)
def main():
print 'begin page_thread'
page_thread = Thread(target=getPageUrl)
page_thread.start()
time.sleep(20)
print 'begin url_thread'
url_thread = Thread(target=getPicUrl)
url_thread.start()
time.sleep(40)
print 'begin pic_thread'
pic_thread = Thread(target=getPic).start()
time.sleep(60)
print '\n start two threading'
pic_thread1 = Thread(target=getPic).start()
pic_thread3 = Thread(target=getPic).start()
time.sleep(60)
print '\n start two threading'
pic_thread2 = Thread(target=getPic).start()
pic_thread4 = Thread(target=getPic).start()