import multiprocessing
import random
import re
import time

import requests
from bs4 import BeautifulSoup
from selenium import webdriver
# Output CSV, opened in append mode; shared (via fork) by the worker processes.
# NOTE(review): never closed explicitly — relies on interpreter exit to flush.
fw=open('articles.csv','a',encoding='utf8')
# Desktop Chrome User-Agent so toutiao.com serves the regular page markup.
headers={
'User-Agent':'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.62 Safari/537.36'
}
# Whitelist of common Chinese characters used to filter article text.
# NOTE(review): get_commmon_hanzi (sic) is not defined or imported in this
# chunk — presumably provided elsewhere in the project; confirm before running.
hanzi=get_commmon_hanzi()
def parse_page(url, tag):
    """Fetch an article page, keep only common hanzi from its embedded
    content, and append a ``tag,text`` row to the shared CSV file.

    Runs inside pool workers; writes to the module-level ``fw`` handle
    and filters against the module-level ``hanzi`` set.
    """
    res = requests.get(url, headers=headers)
    try:
        # The article body is embedded in the page's JS as content:'...'.
        # findall returns [] when the marker is absent (e.g. removed
        # article), making the [0] raise IndexError.
        text = re.findall(r'content:(.*\;\'\,)?', res.text, re.S)[0]
    except IndexError:
        # Log enough context to diagnose the miss, then skip this page.
        print(url, tag, res.headers)
        return
    # Keep only characters from the common-hanzi whitelist.
    kept = ''.join(c for c in text if c in hanzi)
    fw.write('{},{}\n'.format(tag, kept))
def crawl_news(driver=None, url=None):
    """Open a toutiao.com channel page, scroll it repeatedly to trigger
    lazy loading, and collect article links from the feed.

    Parameters
    ----------
    driver : optional Selenium WebDriver; a PhantomJS instance is created
        when None. (The original signature required a driver but then
        unconditionally overwrote it, and the call site passed only
        ``url=`` — defaulting both parameters fixes that TypeError.)
    url : channel URL, e.g. 'https://www.toutiao.com/ch/news_tech/'.

    Returns
    -------
    list of [article_url, tag] pairs.
    """
    if driver is None:
        driver = webdriver.PhantomJS(executable_path='utils/phantomjs.exe')
    # Channel slug (e.g. 'news_tech') doubles as the category tag.
    tag = url.split('/')[-2]
    driver.implicitly_wait(5)
    driver.get(url)
    time.sleep(5)
    # Scroll to the bottom repeatedly so the infinite feed keeps loading.
    for _ in range(100):
        time.sleep(3)
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
    soup = BeautifulSoup(driver.page_source, 'lxml')
    items = soup.select('.wcommonFeed ul .item')
    results = list()
    for item in items:
        try:
            href = item.find('a', class_='link title')['href']
            # Only /group/ links point at actual articles.
            if href.find('/group/') != -1:
                results.append(['https://www.toutiao.com{}'.format(href), tag])
        except:
            # Items without a matching <a> (ads, placeholders) are skipped.
            pass
    # quit() terminates the PhantomJS process; close() would leak it.
    driver.quit()
    return results
def crawl_begin():
    """Collect article links from the configured channels, then parse
    them concurrently with a four-worker process pool."""
    channels = [
        'https://www.toutiao.com/ch/news_tech/',
        'https://www.toutiao.com/ch/news_entertainment/',
    ]
    # Flatten the per-channel link lists into one work queue.
    items = [entry for channel in channels for entry in crawl_news(url=channel)]
    print(len(items))
    random.shuffle(items)
    pool = multiprocessing.Pool(processes=4)
    for link, tag in items:
        pool.apply_async(parse_page, (link, tag))
    pool.close()
    pool.join()
    print("done.")
if __name__ == '__main__':
crawl_begin()