Python crawlers are an important part of data mining and data processing. To help everyone get a deeper understanding of crawlers, this series will publish crawler tutorials and examples. The first installment comes with a treat! Take a quick look at http://jandan.net/ooxx first; that should give you plenty of motivation.
import urllib.request
import os

def url_open(url):
    req = urllib.request.Request(url)
    req.add_header('User-Agent', 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.133 Safari/537.36')
    # Bug fix: open the Request object, not the bare URL, so the spoofed
    # User-Agent header is actually sent with the request
    response = urllib.request.urlopen(req)
    html = response.read()
    return html

def get_page(url):
    # Extract the current (newest) page number from the front page
    html = url_open(url).decode('utf-8')
    a = html.find('current-comment-page') + 23
    b = html.find(']', a)
    return html[a:b]

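Why the magic offset of 23? Assuming the page source contains a fragment like the sample below (the historical jandan markup; treat the exact fragment as an assumption), the key 'current-comment-page' is 20 characters long, and skipping 3 more characters for '">[' lands right on the page number:

# Illustrative fragment; the real page markup may differ
snippet = '<span class="current-comment-page">[456]</span>'
a = snippet.find('current-comment-page') + 23  # 20 chars of the key + 3 for '">['
b = snippet.find(']', a)
print(snippet[a:b])  # prints: 456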
def find_imgs(url):
    html = url_open(url).decode('utf-8')
    img_addrs = []
    a = html.find('img src=')
    while a != -1:
        # Look for '.jpg' within 255 characters of the match; jandan's image
        # addresses are protocol-relative, hence the 'http:' prefix
        b = html.find('.jpg', a, a + 255)
        if b != -1:
            img_addrs.append('http:' + html[a+9:b+4])
        else:
            b = a + 9
        a = html.find('img src=', b)
    return img_addrs

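The slice arithmetic is easier to see on a sample line (the URL below is made up): a+9 skips past the 9 characters of 'img src="', and b+4 keeps the '.jpg' extension:

line = '<img src="//wx1.sinaimg.cn/mw600/abc123.jpg" />'
a = line.find('img src=')
b = line.find('.jpg', a, a + 255)
print('http:' + line[a+9:b+4])  # prints: http://wx1.sinaimg.cn/mw600/abc123.jpg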
def save_imgs(folder, img_addrs):
    for each in img_addrs:
        filename = each.split('/')[-1]
        with open(filename, 'wb') as f:
            img = url_open(each)
            f.write(img)

def download_mm(folder='ooxx', pages=10):
    os.mkdir(folder)
    os.chdir(folder)
    url = "http://jandan.net/ooxx/"
    page_num = int(get_page(url))
    for i in range(pages):
        # Bug fix: the original `page_num -= i` skipped pages; walk back
        # exactly one page per iteration instead
        page_url = url + 'page-' + str(page_num - i) + '#comments'
        img_addrs = find_imgs(page_url)
        save_imgs(folder, img_addrs)

if __name__ == '__main__':
    download_mm()
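One caveat before you run it twice: os.mkdir raises FileExistsError if the ooxx folder is already there. If you want the script to be re-runnable, a small tweak (my suggestion, not part of the original) is:

import os
os.makedirs('ooxx', exist_ok=True)  # unlike os.mkdir, this does not fail when the folder already exists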
Optimized version:
# Jandan image scraper
# Imports BeautifulSoup4 (which plays the role a regex would) and requests (a replacement
# for urllib2, but I haven't learned it yet, so this still uses urllib2)
# Note 1: jandan's page numbers run backwards, from large to small; keep that in mind when entering them
# Note 2: if you get "HTTPError: Service Temporarily Unavailable", switching to a different spoofed browser may work
#import requests
import urllib2
import re
import os
from bs4 import BeautifulSoup
# Visit the pages and collect the image addresses into a list
def getjpg(startpage, endpage):
    myjpgs = []
    user_agent = 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.101 Safari/537.36'
    headers = {'User-Agent': user_agent}
    for i in range(startpage, endpage + 1):
        url = 'http://jandan.net/ooxx/page-' + str(i) + '#comments'
        req = urllib2.Request(url, headers=headers)
        res = urllib2.urlopen(req)
        # BeautifulSoup does the same job here as the re module; both packages
        # have to be installed first. select('p > img') matches every <img>
        # tag sitting directly under a <p> tag
        jpgs = BeautifulSoup(res, 'html.parser').select('p > img')
        alljpgs = re.findall('src="(.*?)"', str(jpgs), re.S)
        # type() shows both are lists, so they can simply be concatenated
        myjpgs = myjpgs + alljpgs
    return myjpgs
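To make the select + findall combination concrete, here is what it extracts from a made-up fragment. Note the protocol-relative // address, which is why the save step below has to give it a scheme:

from bs4 import BeautifulSoup
import re

html = '<p><img src="//wx1.sinaimg.cn/a.jpg" /></p>'  # made-up fragment
jpgs = BeautifulSoup(html, 'html.parser').select('p > img')
print(re.findall('src="(.*?)"', str(jpgs), re.S))  # prints: ['//wx1.sinaimg.cn/a.jpg']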
# Save to disk
def savejpg(myjpgs):
    j = 1
    for myjpg in myjpgs:
        # os.path.splitext() splits a path into two parts: [0] is the name
        # and [1] is the extension
        lastname = os.path.splitext(myjpg)[1]
        # Bug fix: the extracted addresses are protocol-relative (they start
        # with //), so give them a scheme before opening them
        if myjpg.startswith('//'):
            myjpg = 'http:' + myjpg
        # with open() as ... is the tidier way to handle files; 'wb' is binary
        # mode, and the name after `as` is up to you
        with open('E:\\pytest\\pyget\\test25_jiandan\\' + str(j) + str(lastname), 'wb') as op:
            print u'Saving image %d...' % j
            j = j + 1
            # What we have is the image's address, so it has to be opened and
            # read like any other URL; writing it directly would just write
            # the address string itself
            m = urllib2.urlopen(myjpg)
            op.write(m.read())
            # no explicit close() needed: the with block closes the file
# Program entry point
# jandan's page numbers run backwards (the newest page has the largest
# number), so the page you *start* browsing from is the larger one, which is
# the upper bound of the range above
endpage = int(raw_input(u'Enter the starting page number: '))
startpage = int(raw_input(u'Enter the ending page number: '))
savejpg(getjpg(startpage, endpage))
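The header comment mentions requests as the replacement for urllib2. For anyone who wants to try it, here is a minimal Python 3 sketch of the same fetch-and-save steps with requests; the page number and the image URL are made-up examples:

import requests

headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.101 Safari/537.36'}
res = requests.get('http://jandan.net/ooxx/page-100#comments', headers=headers)
html = res.text  # decoded page source, ready to feed to BeautifulSoup

img = requests.get('http://wx1.sinaimg.cn/mw600/a.jpg', headers=headers)  # made-up image address
with open('1.jpg', 'wb') as f:
    f.write(img.content)  # .content holds the raw response bytes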
Be sure to dig into how the image addresses are obtained here, and into how to get around anti-scraping measures~
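On that last point, note 2 above already hints at one trick: swapping the spoofed browser. Below is a minimal sketch combining the two most common counter-measures, rotating User-Agent strings and pacing requests. polite_open is my own name for it, and the two UA strings are simply the ones used in the scripts above:

import random
import time
import urllib.request

USER_AGENTS = [
    'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.133 Safari/537.36',
    'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.101 Safari/537.36',
]

def polite_open(url):
    # Pick a random browser identity and pause between requests so the
    # traffic looks less like a script hammering the server
    req = urllib.request.Request(url)
    req.add_header('User-Agent', random.choice(USER_AGENTS))
    time.sleep(random.uniform(1, 3))
    return urllib.request.urlopen(req).read()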