使用 Python 获取今日头条指定关键词的搜索结果文章
使用线程池（multiprocessing.dummy，即线程池而非进程池）并发抓取
代码如下:
# -*- coding: utf-8 -*-
import requests
import random
import requests
import json
import time
import hashlib
from utils.img_to_tencent import img_to_tencent
def md5(text):
    """Return the hex MD5 digest of *text* (UTF-8 encoded).

    Used to build a stable dedup key (``target_url_md5``) for crawled URLs.
    Parameter renamed from ``str`` to avoid shadowing the builtin.
    """
    return hashlib.md5(text.encode('utf-8')).hexdigest()
# Pool of desktop User-Agent strings; one is picked at random per crawl to
# reduce the chance of being blocked.
# NOTE: the original list was missing a comma after the first entry, so two
# UA strings were silently concatenated into one invalid value.
PC_UAS = [
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_1) AppleWebKit/537.73.11 (KHTML, like Gecko) Version/7.0.1 Safari/537.73.11',
    'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/32.0.1700.76 Safari/537.36',
    'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:26.0) Gecko/20100101 Firefox/26.0',
    'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/32.0.1700.107 Safari/537.36',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/32.0.1700.77 Safari/537.36',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/32.0.1700.107 Safari/537.36',
    'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/32.0.1700.102 Safari/537.36',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/32.0.1700.102 Safari/537.36',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.9; rv:26.0) Gecko/20100101 Firefox/26.0',
    'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:26.0) Gecko/20100101 Firefox/26.0',
    'Mozilla/5.0 (Windows NT 6.1; rv:26.0) Gecko/20100101 Firefox/26.0',
    'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/32.0.1700.76 Safari/537.36',
]
def _fetch_search_page(word, offset, headers):
    """Fetch one page of Toutiao search-API results for *word*.

    Retries up to 3 times (proxy/network errors, or an empty 'data' payload).
    Returns the list under the JSON 'data' key, or [] when every attempt
    fails or comes back empty.
    """
    url = ("https://www.toutiao.com/api/search/content/?aid=24&app_name=web_search"
           "&offset=%s&format=json&keyword=%s&autoload=true&count=20&cur_tab=1"
           % (int(offset), word))
    for _attempt in range(3):
        try:
            # Placeholder proxy endpoints — redacted in the original source.
            proxies = {'https': 'http://*******8*****8', 'http': 'http://*******8**8****8'}
            resp = requests.get(url, headers=headers, proxies=proxies, timeout=20)
            time.sleep(0.5)  # throttle to stay under the site's rate limits
            try:
                payload = json.loads(resp.text)
            finally:
                resp.close()
            news = payload['data']
            if news:
                return news
            # Empty page: treat as a soft failure and retry.
        except Exception as e:
            print(e)
            time.sleep(0.1)
    return []


def _store_item(conn, cursor, item, word):
    """Extract one search-result item and upsert it into ``crawl_result``.

    If the URL was seen before, *word* is appended to its keyword list;
    otherwise a new row is inserted (only when the item has summary text).
    Items missing any expected field are skipped, matching the original
    best-effort behaviour.
    """
    try:
        info = item['display']['info']
        data_title = item['title']
        real_url = item['url']
        data_showtime = info['time_factor']
        author_name = info['site_name']
        author_imgs = info['icon_img'] or ''
        data_imgs = info['images']
        if data_imgs:
            data_imgs = data_imgs[0]
            img_to_tencent(data_imgs)
        else:
            data_imgs = ''
        data_content = item['display']['summary']['text']
    except Exception:
        # Item lacks an expected field — skip it silently (original behaviour).
        return
    img_to_tencent(author_imgs)
    # 'ä' in the title/summary indicates a mojibake (mis-decoded) entry; skip it.
    if 'ä' in data_title or 'ä' in data_content:
        return
    print(real_url, data_title, data_imgs, data_content, data_showtime,
          author_name, author_imgs, word)
    target_url_md5 = md5(real_url)
    cursor.execute("select source_keywords from crawl_result where target_url_md5=%s",
                   (target_url_md5,))
    row = cursor.fetchone()
    if row:
        # URL already stored: merge *word* into its keyword list (deduplicated).
        source_keywords = row[0]
        if word not in source_keywords.strip().split(","):
            source_keywords += ",%s" % word
            source_keywords = ','.join(set(source_keywords.split(",")))
            cursor.execute("update crawl_result set source_keywords=%s where target_url_md5=%s",
                           (source_keywords, target_url_md5))
            conn.commit()
            print('ok1111')
    elif data_content:
        cursor.execute(
            "insert into crawl_result(target_url,target_url_md5,addtime,data_title,data_imgs,data_content,data_showtime,data_json,source,source_keywords,state,author_name,author_imgs,author_id,author_json) "
            "values(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)",
            (real_url, target_url_md5, int(time.time()), data_title, data_imgs,
             data_content, data_showtime, '', 6, word, 0, author_name,
             author_imgs, '', ''))
        conn.commit()
        print('ok')


def crawl_baidu(word):
    """Crawl 3 pages (20 items each) of Toutiao search results for *word*
    and persist them into the ``crawl_result`` MySQL table.

    Fixes over the original: one DB connection per call (the original opened
    a new, never-closed connection per item), connection closed in a
    ``finally`` block, single JSON parse per page, and a well-formed
    Accept-Language header (was "Accept - Language").
    """
    import MySQLdb  # TODO: move to module-level imports (was missing entirely)

    pc_headers = {
        'User-Agent': random.choice(PC_UAS),
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        'Accept-Encoding': 'gzip, deflate, br',
        'Accept-Language': 'zh-CN,zh;q=0.9',
        'Cache-Control': 'no-cache',
        'Pragma': 'no-cache',
        'Sec-Fetch-Mode': 'navigate',
        'Sec-Fetch-Site': 'same-origin',
        'Sec-Fetch-User': '?1',
        'Upgrade-Insecure-Requests': '1',
        'Cookie': "tt_webid=6710713392061285902; WEATHER_CITY=%E5%8C%97%E4%BA%AC; tt_webid=6710713392061285902; UM_distinctid=16bc9db8a29f6-0417349b599406-516d3e71-13c680-16bc9db8a2d85; csrftoken=5eb2a0e00bcbb888f417ef261ee5269a; CNZZDATA1259612802=1761938442-1562456487-https%253A%252F%252Fwww.baidu.com%252F%7C1562461887; s_v_web_id=ddb620b1224506f21ba99de20d4169e3; __tasessionId=ned4t635k1562467258609",
    }
    # Credentials are redacted placeholders — TODO: fill in real values
    # (the original 'port' was the redaction '*****6', a syntax error).
    mysql_config = {
        'host': '*****888888',
        'port': 3306,
        'user': 'root',
        'passwd': '***88',
        'db': '*********',
        'charset': 'utf8',
    }
    conn = MySQLdb.connect(**mysql_config)
    cursor = conn.cursor()
    try:
        for page_idx in range(3):
            news = _fetch_search_page(word, page_idx * 20, pc_headers)
            for item in news:
                _store_item(conn, cursor, item, word)
    finally:
        cursor.close()
        conn.close()
if __name__ == '__main__':
    # multiprocessing.dummy.Pool is a THREAD pool — appropriate here since the
    # work is network/DB I/O-bound.
    from multiprocessing.dummy import Pool

    # TODO: load the real keyword list. In the original script the only
    # definition of `kws_list` was commented out, so pool.map raised NameError.
    kws_list = ['化妆水']

    pool = Pool(20)
    try:
        pool.map(crawl_baidu, kws_list)
    finally:
        pool.close()
        pool.join()
    # Note: the original also called cursor.close()/conn.close() here, but
    # those names were locals of crawl_baidu and undefined at module scope
    # (NameError); each worker now closes its own connection.