背景:在虚拟机Ubuntu 16.04中,利用xpath等工具爬取www.uumnt.cc的图片
当然,我们要爬取的是动物板块!
程序分析:将动物板块一页一页分析拿取出来,然后拿取各个动物页面的链接,再对链接进行分析拿取图片(每个链接拿取4张图)
效果为:
源码如下:
1 # -*- coding:utf-8 -*-
2
3 #准备爬取https://www.uumnt.cc/dongwu/的一些图片
4
5 import urllib
6 import urllib2
7 import re
8 import random
9 from lxml import etree
10
11
def loadPage(url):
    # Fetch one list page of the animal section and visit every
    # animal-detail link it contains.
    headers = {"User-Agent":"Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:45.0) Gecko/20100101 Firefox/45.0"}
    request = urllib2.Request(url, headers=headers)
    page_source = urllib2.urlopen(request).read()

    tree = etree.HTML(page_source)
    # Each matching @href is a detail page path relative to the site root.
    hrefs = tree.xpath('//div[@class="best-pic-c clearfix"]/ul/li/a[@class="best-pic-c-pic"]/@href')

    for href in hrefs:
        # Build the absolute detail-page URL and crawl its sub-pages.
        loadsunPage('https://www.uumnt.cc' + href)
# Crawl the sub-pages of one animal gallery.
def loadsunPage(url):
    # A gallery's first page is .../dongwu/<id>.html and its siblings are
    # .../dongwu/<id>_2.html ... ; pages 2-5 are fetched here (4 images).
    #
    # Fixes over the original pattern r"(https://www.uumnt.cc/dongwu/)+(\d*)":
    # the dots are now escaped (a '.' matched any character), the spurious
    # '+' repetition of the whole prefix group is gone, and at least one
    # digit is required for the gallery id.
    match = re.match(r"https://www\.uumnt\.cc/dongwu/\d+", url)
    if match is None:
        # Unexpected URL shape: skip this link instead of crashing on
        # None.group() with an AttributeError.
        return
    base = match.group()
    for page in range(2, 6):
        writeImage("%s_%d.html" % (base, page))
def writeImage(url):
    # Fetch one gallery sub-page, extract the picture URL(s) with xpath
    # and hand each one to the downloader.
    headers = {"User-Agent":"Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:45.0) Gecko/20100101 Firefox/45.0"}
    req = urllib2.Request(url, headers=headers)
    source = urllib2.urlopen(req).read()

    doc = etree.HTML(source)
    # All matched @src values are direct image links.
    for src in doc.xpath('//img[@class="center other_cont_pics"]/@src'):
        loadImage(src)
56 #下载图片
57 def loadImage(link):
58 #下载
59 headers = {"User-Agent":"Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:45.0) Gecko/20100101 Firefox/45.0"}
60 request = urllib2.Request(link,headers = headers)
61 image = urllib2.urlopen(request).read()
62
63 a = random.randint(1,100000000)
64 filename = str(a)
65
66 with open('/home/cl/桌面/uumntanimal'+filename+'.jpg',"wb") as f:
67 f.write(image)
68 print "download successful-" +filename+".jpg"
69
70
71
72 if __name__ == "__main__":
73 url = "https://www.uumnt.cc/dongwu/"
74 #为了方便,从第二页开始爬取
75 print '请输入需爬取的页数:',
76 a = input()
77 for i in range(2,a+1):
78 #print url
79 url = 'https://www.uumnt.cc/dongwu/list_%d.html'%i
80 loadPage(url)
81
代码中有很多print link之类的是为了调试检测程序,对新手来说很好用!