# News crawler with a progress bar (爬虫新闻+进度条打印)
def read_file(path1):
    """Load every CSV file matching a glob pattern into one DataFrame.

    Args:
        path1: Glob pattern (e.g. ``'data/test*.csv'``) selecting the CSV
            files to read. Each file is decoded as UTF-8.

    Returns:
        A single ``pandas.DataFrame`` holding the rows of all matched files,
        concatenated in sorted path order with a fresh 0..n-1 index.

    Raises:
        ValueError: If no file matches ``path1``.
    """
    # glob() returns paths in arbitrary order; sort them so the row order of
    # the combined frame is reproducible between runs.
    paths = sorted(glob(str(path1)))
    if not paths:
        # pd.concat([]) would raise a ValueError that does not name the
        # pattern; fail early with the same exception type and a clear message.
        raise ValueError(f'no CSV files match pattern: {path1!r}')

    frames = []
    for csv_path in paths:
        print(csv_path)  # progress output: one line per file read
        frames.append(pd.read_csv(csv_path, encoding='utf-8'))
    return pd.concat(frames, ignore_index=True)
import inspect
def getnews(url):
    """Download a news article and return its extracted body text.

    The imports live inside the function on purpose: module-level code in
    this file copies this function's source text into a standalone helper
    module, so the body has to be self-contained. That copy step renames the
    function with a plain text replacement, so avoid spelling the function's
    own name anywhere in this body.

    Args:
        url: Address of the article; it is converted with ``str()`` first.

    Returns:
        The article text, or ``numpy.nan`` when the page cannot be
        downloaded or parsed.
    """
    import numpy as np
    from newspaper import Article

    try:
        news = Article(str(url), language='zh')
        news.download()
        news.parse()
    except Exception:
        # Best-effort scraping: one bad URL must not abort the whole batch.
        # Catch Exception rather than using a bare ``except`` so that
        # KeyboardInterrupt / SystemExit still stop the process.
        return np.nan
    return news.text
# Copy the scraper's source into ./tmp_func.py under the name "task" and
# import it back from there; ``task`` is the callable handed to the worker
# pool further down.
# NOTE(review): presumably a workaround for pickling functions defined in an
# interactive session / __main__ -- confirm before removing.
# NOTE(review): the import below only succeeds when the current working
# directory is on sys.path -- verify for how this file is launched.
# The replace() call renames every occurrence of the function's name in the
# copied text, not just the one on the ``def`` line.
# Write the file as UTF-8 explicitly: Python reads source files as UTF-8 by
# default, while open() without ``encoding`` uses the platform locale.
with open('./tmp_func.py', 'w', encoding='utf-8') as handle:
    handle.write(inspect.getsource(getnews).replace(getnews.__name__, "task"))
from tmp_func import task
import tqdm
def get_newspaper_mp(path1, path2):
    """Scrape article text for every URL in the input CSVs, in parallel.

    Args:
        path1: Glob pattern of the input CSV files, passed to ``read_file``.
            The combined data must contain a ``采集网址`` column holding the
            article URLs.
        path2: Path of the CSV file the result is written to (UTF-8).

    Returns:
        The combined input ``DataFrame`` with an added ``内容`` column that
        holds, for each row, whatever ``task`` returned for that row's URL.
    """
    sheet = read_file(path1)
    urls = sheet['采集网址'].tolist()

    # Use all but two cores, but never ask for fewer than one worker:
    # Pool() raises ValueError for a process count below 1, which is what
    # ``cpu_count() - 2`` produces on machines with one or two cores.
    n_workers = max(1, mp.cpu_count() - 2)

    # The context manager tears the pool down even if a task raises; every
    # result has already been collected into the list by the time it exits.
    with mp.Pool(processes=n_workers) as pool:
        # tqdm.tqdm(iterable, total) wraps the result iterator to draw a
        # progress bar. imap yields results in input order, so the list
        # stays aligned with the rows of ``sheet``.
        res = list(tqdm.tqdm(pool.imap(task, urls), total=len(urls)))

    sheet['内容'] = res
    sheet.to_csv(str(path2), encoding='utf-8')
    return sheet
from glob import glob
import pandas as pd
import numpy as np
from newspaper import Article
import threading
import numba
import multiprocessing as mp
import tqdm
if __name__ == '__main__':
    # Script entry point. The locations are hard-coded for this run: path1 is
    # the glob pattern of the input CSV files, path2 the CSV file that
    # receives the result.
    path1, path2 = (
        'D:\\导师任务\\2016-2021新闻\\爬取新闻test\\test*.csv',
        'D:\\导师任务\\2016-2021新闻\\test新闻.csv',
    )
    get_newspaper_mp(path1, path2)