import asyncio
import time
import json
import multiprocessing as mp

import aiohttp
from scrapy import Selector
import xlwt

# from apscheduler.schedulers.asyncio import AsyncIOScheduler
# from apscheduler.schedulers.background import BlockingScheduler
# scheduler = BlockingScheduler()
# scheduler = AsyncIOScheduler()

base_url = "http://www.qingting.fm/channels/53972"
# base_url = "http://127.0.0.1:4000/"

# DON'T OVER-CRAWL THE WEBSITE OR YOU MAY NEVER VISIT AGAIN
if base_url != "http://127.0.0.1:4000/":
    restricted_crawl = True   # set but never read below
else:
    restricted_crawl = False

seen = set()
unseen = set([base_url])
sema = asyncio.Semaphore(3)   # created but never acquired; see the note after the listing

# one workbook, one sheet; row 0 holds the column headers
BOOK = xlwt.Workbook()
sheet = BOOK.add_sheet('qingting')
header = ['album_name', 'album_image', 'album_introduce', 'program_name', 'program_mp3_url']
for i in range(len(header)):
    sheet.write(0, i, header[i])


def parse(html, url):
    if url == base_url:
        # the channel page: scrape the album fields and build the paginated API urls
        selector = Selector(text=html)
        page_urls = set()
        album_introduce = selector.css('div[data-reactid]>div[title]::attr(title)').extract_first()
        album_name = selector.css('h1[class="_3h7q"]::text').extract_first()
        album_image = selector.css('div[class="_115e _2p_y"]>img::attr(src)').extract_first()
        # the next-to-last pager item holds the total page count
        total_pages = selector.css('ul>li:nth-last-child(2)>a::text').extract_first()
        total_pages = int(total_pages)
        print(total_pages)
        # program links on the channel page (extracted but not used further)
        program_url = selector.css('li[class="_1EVW"]>span[class="_3xHH"]>a::attr(href)').extract()
        # build one API url per page of 10 programs and queue them all
        next_urls = ['http://i.qingting.fm/wapi/channels/53972/programs/page/{}/pagesize/10'.format(i)
                     for i in range(1, total_pages + 1)]
        page_urls.update(next_urls)
        return album_introduce, album_name, album_image, page_urls, total_pages
    elif 'http://i.qingting.fm/wapi/channels/53972/programs/page' in url:
        # a JSON API page: use json.loads, never eval, on a server response
        dict_data = json.loads(html)
        print('dict_data', dict_data)
        if 'data' in dict_data:
            list_pro_data = []
            for dict_id in dict_data['data']:
                file_path = 'http://od.qingting.fm/' + dict_id['file_path'].replace('\\', '')
                name = dict_id['name']
                # pack url and name into one string; "#$#" is the field separator
                program_data = file_path + "#$#" + name
                print(program_data)
                list_pro_data.append(program_data)
            return list_pro_data


async def crawl(url, session):
    # with (await sema):  # old throttle syntax, left disabled; see the note after the listing
    r = await session.get(url)
    html = await r.text()
    print(html)
    await asyncio.sleep(0.1)
    print('url', url)
    return html, url


async def main(loop):
    pool = mp.Pool(8)   # parsing runs in worker processes
    # ssl=False replaces the deprecated verify_ssl=False keyword
    async with aiohttp.ClientSession(connector=aiohttp.TCPConnector(ssl=False)) as session:
        count = 0
        while len(unseen) != 0:
            print('\nAsync Crawling...')
            tasks = [loop.create_task(crawl(url, session)) for url in unseen]
            finished, unfinished = await asyncio.wait(tasks)
            htmls = [f.result() for f in finished]

            print('\nDistributed Parsing...')
            parse_jobs = [pool.apply_async(parse, args=(html, url)) for html, url in htmls]
            results = [j.get() for j in parse_jobs]
            print('results', results)

            print('\nAnalysing...')
            seen.update(unseen)
            print('\033[32;1mseen\033[0m', len(seen))
            unseen.clear()
            # crude dispatch: only the channel-page result is a 5-tuple
            if len(results[0]) == 5:
                for album_introduce, album_name, album_image, page_urls, total_pages in results:
                    print('page_urls_1:', page_urls)
                    print(len(page_urls))
                    for url in page_urls:
                        with open('get_requested_mp3_url.txt', 'a') as f2:
                            f2.write(url + '\n')
                    # pre-fill the album columns for every expected program row
                    for row_id in range(1, total_pages * 10 + 1):
                        sheet.write(row_id, 0, album_name)
                        sheet.write(row_id, 1, album_image)
                        sheet.write(row_id, 2, album_introduce)
                    unseen.update(page_urls - seen)
            else:
                # later rounds: each result is a list of "url#$#name" strings
                print('results_{}'.format(count), results)
                print('results_{}_len'.format(count), len(results))
                for result in results:
                    if not result:   # parse() returns None when an API page has no 'data'
                        continue
                    for program in result:
                        count += 1
                        file_path = program.split('#$#')[0]
                        name = program.split('#$#')[-1]
                        sheet.write(count, 3, name)
                        sheet.write(count, 4, file_path)
    # save once the url frontier has drained
    BOOK.save('qingting_fm_三字经.xls')


if __name__ == "__main__":
    t1 = time.time()
    loop = asyncio.get_event_loop()
    asyncio.set_event_loop(loop)
    loop.run_until_complete(main(loop))
    print("Async total time: ", time.time() - t1)
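Two details in the listing deserve a closer look. First, sema = asyncio.Semaphore(3) is created but never acquired (the with (await sema): line in crawl is commented out), so nothing actually throttles the requests despite the over-crawling warning at the top. A minimal sketch of how crawl could honor that limit, using the modern async with syntax in place of the deprecated with (await sema) form:

async def crawl(url, session):
    async with sema:              # wait for one of the 3 Semaphore slots
        r = await session.get(url)
        html = await r.text()
    await asyncio.sleep(0.1)      # stay polite between requests
    print('url', url)
    return html, url

With the semaphore held only around the request itself, at most three fetches hit the server at once while the remaining coroutines queue up.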
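Second, the overall architecture (aiohttp fetches pages concurrently on the event loop while a multiprocessing pool parses the responses on separate cores) stands on its own. Below is a self-contained sketch of the same crawl-then-parse split under modern asyncio, assuming Python 3.8+ with aiohttp installed; the example.com URLs and the parse_title helper are placeholders, not part of the original script:

import asyncio
from concurrent.futures import ProcessPoolExecutor

import aiohttp


def parse_title(html):
    # CPU-bound work: runs in a worker process so it never blocks the event loop
    start, end = html.find("<title>"), html.find("</title>")
    return html[start + 7:end] if start != -1 and end != -1 else ""


async def fetch(session, url):
    async with session.get(url) as resp:
        return await resp.text()


async def main():
    urls = ["https://example.com/"] * 4   # placeholder URLs
    loop = asyncio.get_running_loop()
    async with aiohttp.ClientSession() as session:
        # I/O-bound phase: every request is in flight at once
        htmls = await asyncio.gather(*(fetch(session, u) for u in urls))
    # CPU-bound phase: farm the parsing out to a process pool
    with ProcessPoolExecutor() as pool:
        titles = await asyncio.gather(
            *(loop.run_in_executor(pool, parse_title, h) for h in htmls))
    print(titles)


if __name__ == "__main__":
    asyncio.run(main())

This is the same division of labor as main() above (asyncio.wait plus mp.Pool.apply_async), expressed with asyncio.run, asyncio.gather, and run_in_executor, which replace the manual loop management and the blocking j.get() polling.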