话不多说,上代码:
class get_article_content(object):#对象属性只有传入的是初始值和末位置
"""抓取文章内容类"""
.............................省略
def run(initial_value, last_value):
    """Crawl one slice of the article URL list and save the results.

    Reads the source URLs with ``get_article_content.read_csv``, hands them
    to ``get_article_content.crawl_article`` together with the two bounds,
    and passes whatever that returns to
    ``get_article_content.save_keyword_retrieval``.

    Args:
        initial_value: Start position of the slice this call handles.
        last_value: End position of the slice this call handles.
            NOTE(review): whether the end is inclusive is decided inside
            ``crawl_article``, which is not visible here -- confirm.
    """
    source_urls = get_article_content.read_csv()  # original article URLs
    crawled = get_article_content.crawl_article(
        source_urls, initial_value, last_value
    )
    get_article_content.save_keyword_retrieval(crawled)
class thread_crawl(object):
    """Run the article crawl on worker threads and sort the output afterwards."""

    def __init__(self):
        pass

    @staticmethod
    def five_threads():
        """Crawl every article with five threads, then sort the result file.

        The article count is taken from ``get_article_content.read_csv()``
        and divided into five contiguous ranges. Each range is processed by
        ``get_article_content.run`` on its own thread. Once all five threads
        have finished, ``sort_article.sort_it`` is called on
        ``'keyword_retrieval.csv'``.

        Declared as a static method because it uses no instance state; it
        can be called as ``thread_crawl.five_threads()`` or on an instance.
        """
        worker_count = 5
        all_article_num = len(list(get_article_content.read_csv()))
        tem = all_article_num // worker_count

        workers = []
        for index in range(worker_count):
            start = tem * index
            # The last worker runs to the very end so the remainder left by
            # the integer division above is not dropped.
            is_last = index == worker_count - 1
            stop = all_article_num if is_last else start + tem
            workers.append(
                threading.Thread(target=get_article_content.run, args=(start, stop))
            )

        for worker in workers:
            worker.start()

        # Wait on exactly these workers. Polling the process-wide thread
        # count would never reach zero if any unrelated thread were alive,
        # and would delay the sort by up to one polling interval.
        for worker in workers:
            worker.join()

        sort_article.sort_it('keyword_retrieval.csv')  # sort the articles