python 吉他_Python爬取17吉他网吉他谱

# -*- coding: utf-8 -*-
# coding=UTF8

import cookielib
import logging
import os
import re
import sys
import urllib
import urllib2
import urlparse

import chardet
from bs4 import BeautifulSoup

sysEncoding = sys.getfilesystemencoding()

# A single cookie jar shared by every request made through this module.
cookieJar = cookielib.CookieJar()


def _open_with_cookies(req):
    """Open *req* with an opener bound to the shared ``cookieJar``."""
    opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cookieJar))
    return opener.open(req)


def get(url):
    """Fetch *url* and return the raw response body.

    Cookies are read from and stored in the module-level ``cookieJar``.
    Errors raised while opening or reading the URL are not caught here
    and propagate to the caller.
    """
    response = _open_with_cookies(urllib2.Request(url))
    try:
        return response.read()
    finally:
        # Always release the underlying connection/file handle.
        response.close()


def download_guitar_image(url, target):
    """Download the resource at *url* and write it to the file *target*.

    The request is sent with an image-oriented ``Accept`` header. The whole
    body is read into memory and then written to *target* in binary mode,
    overwriting any existing file. Errors raised while opening or reading
    the URL propagate to the caller.
    """
    print('start download guitar image ...')
    req = urllib2.Request(url)
    req.add_header('Accept', 'image/webp,image/*,*/*;q=0.8')
    response = _open_with_cookies(req)
    try:
        content = response.read()
    finally:
        response.close()
    with open(target, 'wb') as out:
        out.write(content)


# Parse the link addresses of the guitar-tab image pages

def parse_guitar_img_link():
    """Collect every guitar-tab entry from the site's paginated index.

    Index pages are requested one after another, starting at page 1, until
    a page contains no matching anchors.

    Returns:
        A list of dicts, one per entry, with keys ``'title'`` (the anchor
        text) and ``'link'`` (the site base URL joined with the anchor's
        ``href``).
    """
    page_list = []
    url_base = 'http://www.17jita.com/'
    page = 1
    while True:
        url = url_base + 'tab/img/index.php?page=' + str(page)
        print(url)
        soup = BeautifulSoup(get(url), "html5lib")
        anchors = soup.select('#ct dl > dt > a')
        if not anchors:
            # An index page without entries marks the end of the listing.
            break
        page_list.extend(
            {'title': anchor.text, 'link': url_base + anchor['href']}
            for anchor in anchors
        )
        page += 1
    return page_list


def _page_url(url, page):
    """Return the URL of sub-page *page* of the article at *url*."""
    if page <= 1:
        return url
    # NOTE(review): the page number is inserted directly before '.html'
    # with no separator; the original code prepended an empty string here,
    # which suggests a separator may have been lost - confirm against the
    # site's real pagination URLs.
    return url.replace('.html', str(page) + '.html')


def download_guitar_image_link_list(url):
    """Collect the image URLs from every sub-page of one tab article.

    Sub-pages are fetched in order until a request fails with
    ``urllib2.URLError`` (the failure is printed and logged) or a page
    contains no matching images.

    Returns:
        A list with the ``src`` attribute of every matched ``<img>``.
    """
    image_link_list = []
    page = 1
    while True:
        page_url = _page_url(url, page)
        try:
            soup = BeautifulSoup(get(page_url), 'html5lib')
            img_list = soup.select('#article_contents a > img')
            image_link_list.extend(img['src'] for img in img_list)
        except urllib2.URLError as e:
            # e.reason may be an exception object rather than a string, so
            # format it instead of concatenating.
            msg = u'下载%s出错, 原因:%s' % (page_url, e.reason)
            print(msg)
            logging.error(msg)
            break
        if not img_list:
            # Stop on an empty page so that a server answering 200 for
            # non-existent pages cannot keep this loop running forever.
            break
        page += 1
    return image_link_list


def main():
    """Download all tab images into ``guitar/<title>/`` directories."""
    logging.basicConfig(
        level=logging.DEBUG,
        format='%(asctime)s %(filename)s[line:%(lineno)d] %(levelname)s %(message)s',
        datefmt='%Y-%m-%d %H:%M:%S',
        filename='guitar.log',
        filemode='a')

    path = 'guitar'
    if not os.path.exists(path):
        os.mkdir(path)

    for page in parse_guitar_img_link():
        print(page['link'] + '(' + page['title'] + ')')
        # NOTE(review): names are encoded as GBK regardless of the host's
        # filesystem encoding - confirm this is intended off Windows.
        guitar_path = path + '/' + page['title'].encode('GBK')
        if not os.path.exists(guitar_path):
            os.mkdir(guitar_path)

        for image_link in download_guitar_image_link_list(page['link']):
            print('\t' + image_link)
            # Keep the leading '/' so it separates directory and file name.
            filename = image_link[image_link.rindex('/'):]
            filepath = guitar_path + filename.encode('GBK')
            download_guitar_image(image_link, filepath)


if __name__ == '__main__':
    main()

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值