# -*- coding: utf-8 -*-
# coding=UTF8
import cookielib
import logging
import os
import re
import sys
import urllib
import urllib2
import urlparse

import chardet
from bs4 import BeautifulSoup

# Encoding used to turn unicode file names into byte strings for the OS.
# On Python 2 sys.getfilesystemencoding() may return None on some Unix
# systems, so fall back to UTF-8 to keep the value usable with str.encode().
sysEncoding = sys.getfilesystemencoding() or 'utf-8'

# One cookie jar shared by every request, so cookies set by one response are
# sent with the following page and image requests.
cookieJar = cookielib.CookieJar()


def _read_response(req):
    """Open *req* through the shared cookie jar and return the body bytes."""
    opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cookieJar))
    response = opener.open(req)
    try:
        return response.read()
    finally:
        # Always release the connection, even if read() fails part-way.
        response.close()


def get(url):
    """Fetch *url* and return the raw response body.

    Args:
        url: address to request.

    Returns:
        The body exactly as returned by ``response.read()``.

    Raises:
        urllib2.URLError: propagated unchanged when the request fails.
    """
    return _read_response(urllib2.Request(url))


def download_guitar_image(url, target):
    """Download the image at *url* and write it to the file *target*.

    Args:
        url: address of the image.
        target: path of the file to create (opened in binary write mode).

    Raises:
        urllib2.URLError: propagated unchanged when the request fails.
    """
    print('start download guitar image ...')
    req = urllib2.Request(url)
    req.add_header('Accept', 'image/webp,image/*,*/*;q=0.8')
    content = _read_response(req)
    with open(target, 'wb') as code:
        code.write(content)


# Parse the link addresses of the guitar tab image pages
def parse_guitar_img_link():
    """Collect the tab pages listed in the site's image-tab index.

    Requests ``tab/img/index.php?page=N`` starting at N=1 and stops at the
    first index page on which the ``#ct dl > dt > a`` selector matches
    nothing.

    Returns:
        A list of ``{'title': ..., 'link': ...}`` dicts, one per matched
        anchor; ``link`` is the anchor's ``href`` resolved against the site
        base URL.

    Raises:
        urllib2.URLError: propagated from ``get`` when an index page cannot
            be fetched.
    """
    page_list = []
    url_base = 'http://www.17jita.com/'
    page = 1
    while True:
        url = url_base + 'tab/img/index.php?page=' + str(page)
        print(url)
        html = get(url)
        soup = BeautifulSoup(html, "html5lib")
        anchors = soup.select('#ct dl > dt > a')
        if not anchors:
            break
        for item in anchors:
            page_list.append({
                'title': item.text,
                # urljoin gives the same result as plain concatenation for a
                # relative href, and also copes with absolute or
                # root-relative ones.
                'link': urlparse.urljoin(url_base, item['href']),
            })
        page += 1
    return page_list


def download_guitar_image_link_list(url):
    """Return the image URLs found on a tab page and its follow-up pages.

    Page 1 is *url* itself; later pages are derived from it by inserting the
    page number before ``.html``. Collection stops when a page cannot be
    fetched (the error is printed and logged) or when a page contains no
    matching image.

    Args:
        url: address of the first page of the tab.

    Returns:
        A list of image URLs, each resolved against the page it was found on.
    """
    image_link_list = []
    page = 1
    while True:
        page_url = url
        if page > 1:
            # NOTE(review): the page number is inserted directly before
            # '.html' with no separator (the original concatenated an empty
            # string here) - confirm against the site's real pagination URLs.
            page_url = url.replace('.html', '' + str(page) + '.html')
        try:
            html = get(page_url)
        except urllib2.URLError as e:
            # e.reason may be a plain string or another exception object;
            # normalise it so building the message cannot itself fail.
            reason = e.reason
            if not isinstance(reason, unicode):
                reason = str(reason).decode(sysEncoding or 'utf-8', 'replace')
            msg = u'下载' + page_url + u'出错, 原因:' + reason
            print(msg)
            logging.error(msg)
            break
        soup = BeautifulSoup(html, 'html5lib')
        img_list = soup.select('#article_contents a > img')
        if not img_list:
            # A page without images means there is nothing left to collect;
            # without this check the loop would only ever end on an error.
            break
        for img in img_list:
            # src may be relative; make it absolute so it can be requested.
            image_link_list.append(urlparse.urljoin(page_url, img['src']))
        page += 1
    return image_link_list


def _safe_dir_name(title):
    """Turn a scraped title into a single, safe directory name (unicode)."""
    # Titles are remote input: replace path separators, control whitespace and
    # characters Windows rejects in file names, so the name cannot leave the
    # download directory or make mkdir fail.
    cleaned = re.sub(u'[\\\\/:*?"<>|\r\n\t]', u'_', title).strip(u' .')
    return cleaned or u'untitled'


def main():
    """Download every tab image into ``guitar/<title>/<image file name>``.

    Appends log records to ``guitar.log``. A failed image download is logged
    and skipped so the remaining images are still fetched.
    """
    logging.basicConfig(
        level=logging.DEBUG,
        format='%(asctime)s %(filename)s[line:%(lineno)d] %(levelname)s %(message)s',
        datefmt='%Y-%m-%d %H:%M:%S',
        filename='guitar.log',
        filemode='a')
    path = 'guitar'
    if not os.path.exists(path):
        os.mkdir(path)
    # Encode names with the file-system encoding rather than a hard-coded
    # 'GBK'; the value can be None on Python 2, hence the fallback.
    fs_encoding = sysEncoding or 'utf-8'
    page_list = parse_guitar_img_link()
    for page in page_list:
        print(page['link'] + '(' + page['title'] + ')')
        dir_name = _safe_dir_name(page['title']).encode(fs_encoding, 'replace')
        guitar_path = path + '/' + dir_name
        if not os.path.exists(guitar_path):
            os.mkdir(guitar_path)
        image_link_list = download_guitar_image_link_list(page['link'])
        for image_link in image_link_list:
            print('\t' + image_link)
            # Use only the last segment of the URL path, so a query string
            # never ends up in the file name.
            filename = urlparse.urlsplit(image_link).path.rsplit('/', 1)[-1]
            if not filename:
                logging.warning('no file name in image link: %s', image_link)
                continue
            filepath = guitar_path + '/' + filename.encode(fs_encoding, 'replace')
            try:
                download_guitar_image(image_link, filepath)
            except IOError:
                # urllib2.URLError is an IOError subclass on Python 2, so this
                # covers both network and file-write failures.
                logging.exception('failed to download %s', image_link)


if __name__ == '__main__':
    main()