最近感觉找乐谱下载之类比较麻烦, 于是花了两天时间写了个爬虫程序, 效率倍增。
使用方法: 在 https://www.17jita.com 上搜索乐谱, 点开第一个页面, 将网址按程序内格式赋给 url_sa 变量, 然后运行, 便可以在 D:/pic/ 目录下获得乐谱。
效果如下:
源码(日后有时间会继续完善):
import urllib.request
import re
import os
from bs4 import BeautifulSoup
# --- scraper configuration and shared module-level state ---
recipeWeb = "https://www.17jita.com/"  # base URL of the guitar-tab site
recipeWeb_len = len(recipeWeb)  # NOTE(review): appears unused in this file — confirm before removing
webList = []  # whitespace tokens from the first page that look like href links
FinalList = []  # deduplicated full URLs of every page belonging to this tab
url_sa = "tab/img/4289"  # site-relative path of the tab's first page (set by the user)
url = recipeWeb + url_sa + '.html'  # full URL of the tab's first page
def getList(url):
    """Fetch *url* and return the whitespace-delimited tokens of its HTML
    that start with ``href=`` and contain ``html`` — i.e. candidate links
    to the other pages of the same tab."""
    page_text = getData(url)
    return findPreAndPost(r'href=', r'.*?html', page_text)
def findPreAndPost(pre, post, data):
    """Return the whitespace-delimited tokens of *data* matching both patterns.

    Parameters
    ----------
    pre, post : str
        Regular-expression patterns, each tested with ``re.match`` (anchored
        at the start of the token).
    data : str
        Raw page text to split on single whitespace characters.

    Returns
    -------
    list[str]
        Matching tokens, in their original order.
    """
    # Compile once instead of re-matching raw pattern strings per token;
    # the original also made two full passes (filter by `pre`, then by
    # `post`) — one pass with `and` yields the identical result.
    pre_re = re.compile(pre)
    post_re = re.compile(post)
    tokens = re.split(r'\s', data)
    return [tok for tok in tokens if pre_re.match(tok) and post_re.match(tok)]
def getData(url):
    """Fetch *url* and return its body decoded as GBK text.

    The target site (17jita.com) serves GBK-encoded pages, hence the
    hard-coded codec.  Propagates ``urllib.error.URLError`` on network
    failure and ``UnicodeDecodeError`` on an unexpected encoding.
    """
    # BUG FIX: the original never closed the HTTP response, leaking the
    # socket on every call; the context manager closes it deterministically.
    with urllib.request.urlopen(url) as webPage:
        raw = webPage.read()
    return raw.decode('gbk')
def GetPic(url, cnt):
    """Download every ``.png`` sheet image found on the tab page *url*.

    Images are saved under ``d:/pic/<title>/`` as
    ``<title><cnt>_<index>.png``, where *cnt* identifies the page and
    *index* the image's position within that page.

    Parameters
    ----------
    url : str
        Full URL of one tab page.
    cnt : int
        Page counter used in the output filenames.
    """
    data = getData(url)
    soup = BeautifulSoup(data, 'html.parser')
    title = soup.find('h1', {'class': 'ph'}).text
    # NOTE(review): `title` comes straight from the page and may contain
    # characters that are invalid in Windows filenames — consider sanitizing.
    tokens = re.split(r'\s', data)
    # Tokens that start with src= and mention png: candidate sheet images.
    candidates = [tok for tok in tokens
                  if re.match(r'src=', tok) and re.match(r'.*?png', tok)]
    path = "d:/pic/" + title + '/'
    for index, pic in enumerate(candidates):
        tmp = re.search(r'src="(.*?)"', pic)
        if tmp is None:
            # Token had no quoted src value; the original would have crashed
            # on tmp.group(1) here.
            continue
        pic_url = tmp.group(1)
        print(pic_url)
        # Only absolute http(s) URLs are fetchable; skip relative/empty ones.
        # (startswith also avoids the original's IndexError on an empty URL;
        # the original's `pic_url[0]=='/'` test was redundant with `!='h'`.)
        if not pic_url.startswith('h'):
            continue
        with urllib.request.urlopen(pic_url) as pic_web:
            pic_data = pic_web.read()
        if not os.path.exists(path):
            os.makedirs(path)
        # BUG FIX: the original wrote every image of the page to the same
        # filename (title + cnt), so each download overwrote the previous
        # one; append the per-page image index to keep them all.
        with open(path + title + str(cnt) + '_' + str(index) + ".png", "wb") as f:
            f.write(pic_data)
        print(pic)
# --- script entry: collect every page URL of this tab, then download each ---
webList = getList(url)
FinalList.append(url)  # the first page belongs to the tab as well
# Pattern for sibling pages of the same tab, e.g. href="tab/img/4289_2.html".
# Hoisted out of the loop: it is loop-invariant (the original rebuilt it,
# and pointlessly reset `cnt`, on every iteration).
searchStr = 'href="' + url_sa + '(.*?)"'
for link in webList:
    tmp = re.search(searchStr, link)
    if tmp:
        W = recipeWeb + url_sa + tmp.group(1)
        if W not in FinalList:  # skip pages we already collected
            FinalList.append(W)
# Download the sheet images of every collected page, numbering pages from 1.
for cnt, web_url in enumerate(FinalList, start=1):
    GetPic(web_url, cnt)