妹子图 (Meizitu image scraper):
# Meizitu image scraper.
#
# A producer process (``meizi``) walks the "hot" listing pages, finds every
# gallery, and pushes one (image_page_url, referer_url) tuple per image page
# onto a shared queue.  A pool of worker processes (``meizi_downlode``)
# consumes the queue and downloads the images.
import requests
from urllib import request, parse
from lxml import etree
import os

# Downloaded images are written to this directory (cwd changed at import time).
path = 'D:/Python/meizi'
os.chdir(path)


def meizi(page_all, queue):
    """Crawl listing pages 1..page_all and enqueue image-page URLs.

    Each queue item is a tuple ``(page_lists, info_url_list)`` where
    ``page_lists`` is the URL of a single image page and ``info_url_list``
    is the gallery URL (used later as the Referer header by the workers).
    """
    page = 1
    # BUG FIX: the original used ``if i <= page_all``, so only the first
    # listing page was ever crawled; a while loop visits all of them.
    while page <= page_all:
        meizi_page = 'page/' + str(page) + '/'
        url = 'http://www.mzitu.com/hot/' + meizi_page
        headers = {
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
            'Accept-Encoding': 'gzip, deflate',
            'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8',
            'Cache-Control': 'max-age=0',
            'Connection': 'keep-alive',
            'Host': 'www.mzitu.com',
            'Referer': 'http://www.mzitu.com/hot/',
            'Save-Data': 'on',
            'Upgrade-Insecure-Requests': '1',
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.87 Safari/537.36',
        }
        response = requests.get(url, headers=headers)
        res_ele = etree.HTML(response.text)
        # One <a href> per gallery on the listing page.
        url_ele = res_ele.xpath('//ul[@id="pins"]/li/a/@href')
        for info_url_list in url_ele:
            info_res = requests.get(info_url_list, headers=headers).text
            info_ele = etree.HTML(info_res)
            # Text of the "last page" pagination link, i.e. the page count.
            page_ele = info_ele.xpath('//div[@class="pagenavi"]/a[5]/span/text()')
            for x in page_ele:
                # BUG FIX: the original inner loop reused ``i`` and clobbered
                # the outer page counter; use a distinct variable.
                # NOTE(review): range(1, int(x)) skips page x — presumably the
                # original intent, but confirm against the site's pagination.
                for img_page in range(1, int(x)):
                    page_lists = info_url_list + '/' + str(img_page)
                    queue.put((page_lists, info_url_list))
        page += 1


def meizi_downlode(img_url_referer_url):
    """Download one image given a (image_page_url, referer_url) tuple.

    Fetches the image page, extracts the <img src>, and saves the image
    into the current working directory under its original file name.
    """
    (page_lists, info_url_list) = img_url_referer_url
    list_res = requests.get(page_lists).text
    list_ele = etree.HTML(list_res)
    jpg_url_list = list_ele.xpath('//div[@class="main-image"]/p/a/img/@src')[0]
    print(jpg_url_list)
    # The site rejects image requests whose Referer does not match the gallery.
    headers = {
        'Referer': info_url_list,
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.87 Safari/537.36',
    }
    # File name is the last path component of the image URL.
    meizi_name = jpg_url_list.split('/')[-1]
    res = requests.get(jpg_url_list, headers=headers)
    print(res)
    with open(meizi_name, 'wb') as f:
        f.write(res.content)


if __name__ == '__main__':
    import multiprocessing
    from multiprocessing import Queue, Pool
    from queue import Empty

    # Producer process: fills the queue with (image_url, referer) tuples.
    q = Queue()
    p = multiprocessing.Process(target=meizi, args=(2, q))
    p.start()
    # Worker pool for the downloads.
    download_pool = Pool(50)
    while True:
        try:
            image_url_referer_url = q.get(timeout=20)
        except Empty:
            # BUG FIX: the original bare ``except:`` swallowed every error,
            # including KeyboardInterrupt; only a queue timeout (no new work
            # for 20 s) should end the loop.
            break
        print(image_url_referer_url)
        # Hand the download off to the pool asynchronously.
        download_pool.apply_async(meizi_downlode, (image_url_referer_url,))
    download_pool.close()
    download_pool.join()
    # Join the producer before exiting.
    p.join()
喜马拉雅 (Ximalaya audio downloader):
# Ximalaya audio downloader: scrape the track list of one album page and
# download every episode as an .m4a file via the playback API.
import requests
from urllib import request, parse
from lxml import etree
import os

album_url = 'https://www.ximalaya.com/xiangsheng/61/'
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.87 Safari/537.36'
}
response = requests.get(album_url, headers=headers)
res_ele = etree.HTML(response.text)
# One link per track in the album's track list.
url_ele = res_ele.xpath('//ul[@class="dOi2"]/li/div[2]/a/@href')

# Audio files are saved into this directory.
# BUG FIX: hoisted out of the loop — the original re-ran os.chdir on
# every iteration.
path = 'D:/Python/sound'
os.chdir(path)

for list_url in url_ele:
    # BUG FIX: the original reused the name ``url`` for the API endpoint,
    # clobbering the urljoin base on later iterations; it only worked by
    # accident because the hrefs are absolute paths.  Distinct names make
    # the base stable.
    list_url_all = parse.urljoin(album_url, list_url)
    # The numeric track id is the last path component of the track URL.
    info_id = list_url_all.split('/')[-1]
    track_api_url = 'https://www.ximalaya.com/revision/play/tracks?trackIds=' + str(info_id)
    info_list = requests.get(track_api_url, headers=headers).json()
    track = info_list['data']['tracksForAudioPlay'][0]
    mp3_url = track['src']
    mp3_name = track['trackName']
    # NOTE(review): trackName may contain characters that are invalid in
    # Windows file names — consider sanitizing before writing.
    request.urlretrieve(mp3_url, mp3_name + '.m4a')