__author__ = 'AllenMinD'
import requests,urllib
from bs4 import BeautifulSoup
# Scrape wallpaper images from a multi-page gamersky.com article.
#
# Each <p><a href="..."> anchor on a page points at a redirect URL of the form
#   http://www.gamersky.com/showimage/id_gamersky.shtml?<real image URL>
# so the direct image link is everything after the first '?'.

PAGE_BASE = 'http://www.gamersky.com/ent/201603/730123'
FOLDER_PATH = 'D:/spider_things/2016.4.4/bizhi/'
HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/48.0.2564.109 Safari/537.36',
    'Cookie': 'BIDUPSID=5B700B9ED7BFDE99E48407F4C10FABAA; BAIDUID=05F28292EA8DA5A589737ACF26DD1B31:FG=1; PSTM=1456985091; BDUSS=1hczlEYmxKckJPbU9CRDE0R1hQcWtOOWJIQ2JQY1BRckQ2OW9kdWNnfmhTUjVYQVFBQUFBJCQAAAAAAAAAAAEAAABHG40~AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAOG89lbhvPZWaU',
}


def build_page_url(page):
    """Return the article URL for *page* (page 1 has no `_N` suffix)."""
    if page == 1:
        return PAGE_BASE + '.shtml'
    return PAGE_BASE + '_' + str(page) + '.shtml'


def extract_image_url(href):
    """Strip the showimage redirect prefix and return the direct image URL.

    hrefs look like '...id_gamersky.shtml?http://img1.gamersky.com/...jpg';
    the real link is everything after the first '?'.  This replaces the
    original hard-coded slice ``href[52:]`` (52 == len of the redirect
    prefix), and leaves an already-direct link (no '?') unchanged.
    """
    _, sep, direct = href.partition('?')
    return direct if sep else href


def collect_image_links(html):
    """Parse one page's HTML and return the direct image URLs it references."""
    soup = BeautifulSoup(html, 'lxml')
    return [extract_image_url(a.get('href')) for a in soup.select('p > a')]


def main():
    """Download every wallpaper from all 11 pages into FOLDER_PATH."""
    import os
    from urllib.request import urlretrieve  # Python 3 home of urlretrieve

    os.makedirs(FOLDER_PATH, exist_ok=True)  # don't crash on a missing folder
    ans = 1  # running picture count across all pages; doubles as the filename
    for page in range(1, 12):
        response = requests.get(build_page_url(page), headers=HEADERS)
        for link in collect_image_links(response.text):
            urlretrieve(link, FOLDER_PATH + str(ans) + '.jpg')
            print('You have downloaded', ans, 'picture(s)!~')
            ans = ans + 1


if __name__ == '__main__':
    main()
前些天想做一下游民星空壁纸的爬取,但是一开始像平常那样爬取时,最终下载下来的图片会损坏:
于是,上网查找问题所在,在一个网站中得到一点启发(点击打开链接):初步猜测出错的原因是图片的链接不对
然后我再看了看原来爬取时下载图片所用的链接是:
http://www.gamersky.com/showimage/id_gamersky.shtml?http://img1.gamersky.com/image2016/03/20160319_hc_44_10/gamersky_005origin_009_201631919596C4.jpg
但实际上,下载的原图的链接是:
http://img1.gamersky.com/image2016/03/20160319_hc_44_10/gamersky_005origin_009_201631919596C4.jpg
终于!发现问题了,原来的链接中多了:“http://www.gamersky.com/showimage/id_gamersky.shtml?”
于是利用一个简单的切片便获取了正确的链接:
pic_tag.get('href')[52:]
问题终于解决了!~o(∩_∩)o 哈