#coding: utf-8
import threading
import requests, time
from datetime import datetime
from multiprocessing.dummy import Pool
# Endpoint of the iTunes Search API. Query parameters are passed separately
# so that requests percent-encodes them (names may contain '&', '#', '+', ...).
_ITUNES_SEARCH_URL = 'https://itunes.apple.com/search'

# Seconds to wait on a search request before giving up; without a timeout a
# single stalled connection would block its pool worker indefinitely.
_REQUEST_TIMEOUT = 10

# execute() is run concurrently by a thread pool, so updates to the shared
# failure bookkeeping (countFail / failed) are serialized through this lock.
_FAIL_LOCK = threading.Lock()


def _search_itunes(term, attempt):
    """Search the iTunes store (country=cn, entity=song, limit=15) for ``term``.

    ``attempt`` is only used to label the error message.

    Returns the decoded JSON response, or None when the request fails, the
    server answers with an HTTP error status, or the body is not valid JSON.
    """
    params = [('country', 'cn'), ('entity', 'song'), ('limit', 15), ('term', term)]
    # Build the final URL up front so the exact address can be reported on
    # failure; prepare() applies the same encoding requests.get() would.
    url = requests.Request('GET', _ITUNES_SEARCH_URL, params=params).prepare().url
    try:
        response = requests.get(url, timeout=_REQUEST_TIMEOUT)
        response.raise_for_status()
        return response.json()
    except (requests.RequestException, ValueError):
        # ValueError covers JSON decoding errors across requests versions.
        print('Try error ' + str(attempt) + ' : ' + url)
        return None


def _pick_track_id(results, songName, albumName):
    """Return the trackId of the best match in ``results``.

    A result matches when its track name contains ``songName``
    (case-insensitive), its album name contains neither 'acoustic' nor
    'live', and it agrees with ``albumName`` on whether the album name
    contains 'mix'. When nothing matches, the first result's trackId is
    returned as a fallback.
    """
    wantedSong = songName.lower()
    wantsMix = 'mix' in albumName.lower()
    for candidate in results:
        # Tolerate results lacking these keys instead of raising KeyError,
        # which would abort the whole pool.map() run.
        trackNameQuery = (candidate.get('trackName') or '').lower()
        albumNameQuery = (candidate.get('collectionName') or '').lower()
        if (wantedSong in trackNameQuery
                and 'acoustic' not in albumNameQuery
                and 'live' not in albumNameQuery
                and ('mix' in albumNameQuery) == wantsMix):
            return candidate['trackId']
    return results[0]['trackId']


def execute(i):
    """Look up the iTunes trackId for one song entry.

    ``i`` is one element of the fetched song list; the song name, first
    singer's name and album name are read from ``i['data']``.

    The song is first searched by "song artist album"; if that search
    succeeds but yields no results, it is retried with "song artist" only.

    Returns the chosen trackId, or None when no result was found. In the
    latter case a numbered "<song> by <artist>" entry is appended to the
    global ``failed`` list and the global ``countFail`` is incremented.
    """
    global countFail
    # Raw Data Extract: Start
    data = i['data']
    songName = data['songorig']
    artist = data['singer'][0]['name']
    albumName = data['albumname']
    # Raw Data Extract: End

    songs = _search_itunes(' '.join((songName, artist, albumName)), 1)
    if songs is not None and songs['resultCount'] == 0:
        # Nothing found with the album included: retry with a looser query.
        songs = _search_itunes(' '.join((songName, artist)), 2)

    if songs is not None and songs['resultCount'] > 0:
        return _pick_track_id(songs['results'], songName, albumName)

    # Read-and-increment of the counter must be atomic across pool threads,
    # otherwise two failures can receive the same number.
    with _FAIL_LOCK:
        failed.append(str(countFail) + '. ' + songName + ' by ' + artist)
        countFail += 1
    return None
if __name__ == '__main__':
    # Shared failure bookkeeping; execute() updates both through `global`.
    countFail = 1
    failed = []
    print('Running in Multithreading.')
    begin = datetime.now()
    print('Begin: ' + time.ctime())
    # Raw Data Fetch: Start
    url = 'http://c.y.qq.com/v8/fcg-bin/fcg_v8_toplist_cp.fcg?format=json&inCharset=utf-8&outCharset=utf-8&page=detail&type=top&topid=108'
    # A timeout keeps the script from hanging forever if the chart server
    # never answers.
    songList = requests.get(url, timeout=10).json()['songlist']
    # Raw Data Fetch: End
    pool = Pool(30)
    try:
        # One execute() call per song, spread over 30 worker threads.
        res = pool.map(execute, songList)
    finally:
        # Always release the workers, even if a lookup raised.
        pool.close()
        pool.join()
    # Drop the None entries returned for songs that could not be found.
    results = [x for x in res if x]
    # Assembled summary; not consumed further in the visible code.
    content = {'results': results, 'failed': failed}
    end = datetime.now()
    print('End: ' + time.ctime())
    # total_seconds() covers the whole elapsed span; `.seconds` alone would
    # discard any full days of the timedelta.
    print('Total: ' + str(int((end - begin).total_seconds())) + 'seconds')
对比
以 Billboard Hot 100 的 100 首单曲为例,分别用单线程和多线程方式各执行几次任务,对比两者的运行效率。结果十分明显:
尽管 iTunes Search API 可能有缓存,实际效果不一定那么明显,但显而易见的是,两者的效率不在一个量级上。
总结
如果使用 PHP 模拟异步方法实现并发,其操作复杂程度要远远大于 Python。Python 提供的方法简单得只需要增加约 4 句代码;虽然两种语言并不完全具有可比性,但 Python 在数据处理方面的便利确实突出。针对循环任务,十分推荐使用并发方法。
说明
iTunes Search API 有限流机制,过于频繁地发送请求会直接返回 403 Forbidden,所以大规模的并发请求实际上并没有什么必要,此例仅作调试使用;
实际上在 Web 端也不会有这么好的结果,主要是带宽受限,请求的响应没有自家网络快,也可能是我的机器太渣了;
类似的 iTunes Search TrackId 方法理论上只要修改原始数据获取和原始数据解析的部分(见代码内注释)即可套用,不过我相信没有多少人有这样的需求。
如有问题,欢迎留言或邮件咨询