Crawling Toutiao keyword search results with Python

This script uses Python to fetch Toutiao (今日头条) articles matching a given search keyword.

Keywords are crawled concurrently through a worker pool (multiprocessing.dummy, which is a thread pool despite the package name), and results are stored in a MySQL table named crawl_result.
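The post does not include the crawl_result table definition. Below is a minimal setup sketch with an assumed schema, inferred from the INSERT statement in the script further down: the column names come from that statement, but all types, sizes, and the unique key are guesses.

# Hypothetical one-off setup script; fill in your own MySQL credentials.
import MySQLdb

DDL = """
CREATE TABLE IF NOT EXISTS crawl_result (
    id              INT AUTO_INCREMENT PRIMARY KEY,
    target_url      VARCHAR(512),
    target_url_md5  CHAR(32),
    addtime         INT,
    data_title      VARCHAR(255),
    data_imgs       VARCHAR(512),
    data_content    TEXT,
    data_showtime   VARCHAR(64),
    data_json       TEXT,
    source          INT,
    source_keywords VARCHAR(255),
    state           TINYINT,
    author_name     VARCHAR(255),
    author_imgs     VARCHAR(512),
    author_id       VARCHAR(64),
    author_json     TEXT,
    UNIQUE KEY uk_target_url_md5 (target_url_md5)
) DEFAULT CHARSET=utf8;
"""

conn = MySQLdb.connect(host='127.0.0.1', user='root', passwd='...', db='crawler', charset='utf8')
conn.cursor().execute(DDL)
conn.commit()
conn.close()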

The code is as follows:

# -*- coding: utf-8 -*-

import hashlib
import json
import random
import time

import MySQLdb
import requests

from utils.img_to_tencent import img_to_tencent


def md5(s):
    # Hex MD5 of a string; used below as a fixed-length de-duplication key for article URLs.
    return hashlib.md5(s.encode('utf-8')).hexdigest()

# Pool of desktop User-Agent strings; one is picked at random for each request.
PC_UAS = [
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_1) AppleWebKit/537.73.11 (KHTML, like Gecko) Version/7.0.1 Safari/537.73.11',
    'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/32.0.1700.76 Safari/537.36',
    'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:26.0) Gecko/20100101 Firefox/26.0',
    'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/32.0.1700.107 Safari/537.36',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/32.0.1700.77 Safari/537.36',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/32.0.1700.107 Safari/537.36',
    'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/32.0.1700.102 Safari/537.36',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/32.0.1700.102 Safari/537.36',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.9; rv:26.0) Gecko/20100101 Firefox/26.0',
    'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:26.0) Gecko/20100101 Firefox/26.0',
    'Mozilla/5.0 (Windows NT 6.1; rv:26.0) Gecko/20100101 Firefox/26.0',
    'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/32.0.1700.76 Safari/537.36',
]

def crawl_toutiao(word):
    # Desktop request headers; the cookie was captured from a browser session
    # and will expire, so replace it with a fresh one before running.
    pc_headers = {
        'User-Agent': random.choice(PC_UAS),
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        'Accept-Encoding': 'gzip, deflate, br',
        'Accept-Language': 'zh-CN,zh;q=0.9',
        'Cache-Control': 'no-cache',
        'Pragma': 'no-cache',
        'Sec-Fetch-Mode': 'navigate',
        'Sec-Fetch-Site': 'same-origin',
        'Sec-Fetch-User': '?1',
        'Upgrade-Insecure-Requests': '1',
        'Cookie': 'tt_webid=6710713392061285902; WEATHER_CITY=%E5%8C%97%E4%BA%AC; '
                  'tt_webid=6710713392061285902; UM_distinctid=16bc9db8a29f6-0417349b599406-516d3e71-13c680-16bc9db8a2d85; '
                  'csrftoken=5eb2a0e00bcbb888f417ef261ee5269a; '
                  'CNZZDATA1259612802=1761938442-1562456487-https%253A%252F%252Fwww.baidu.com%252F%7C1562461887; '
                  's_v_web_id=ddb620b1224506f21ba99de20d4169e3; __tasessionId=ned4t635k1562467258609',
    }

    for i in range(0, 3):  # first three result pages, 20 items per page
        page = i * 20
        html_text = ''
        news = ''
        for j in range(3):  # up to three attempts per page
            url = ("https://www.toutiao.com/api/search/content/?aid=24&app_name=web_search"
                   "&offset=%s&format=json&keyword=%s&autoload=true"
                   "&count=20&cur_tab=1" % (page, word))
            try:
                # Proxy addresses are redacted in the original post.
                proxies = {'https': 'http://*******8*****8', 'http': 'http://*******8**8****8'}
                resp = requests.get(url, headers=pc_headers, proxies=proxies, timeout=20)
                time.sleep(0.5)
                html_text = resp.text
                resp.close()
                data = json.loads(html_text)
                news = data['data']
                if not news:
                    continue  # empty page, retry
            except Exception as e:
                print(e)
                time.sleep(0.1)
                continue
            break  # got data, stop retrying

        if not news:
            continue
        for item in news:
            try:
                data_title = item['title']
                real_url = item['url']
                data_showtime = item['display']['info']['time_factor']
                author_name = item['display']['info']['site_name']
                author_imgs = item['display']['info']['icon_img']
                if not author_imgs:
                    author_imgs = ''
                data_imgs = item['display']['info']['images']
                if data_imgs:
                    # Keep only the first article image and re-host it.
                    data_imgs = data_imgs[0]
                    img_to_tencent(data_imgs)
                else:
                    data_imgs = ''
                data_content = item['display']['summary']['text']
            except Exception:
                pass  # skip items that lack any of the expected fields
            else:
                # Re-host the author avatar as well.
                img_to_tencent(author_imgs)
                # Skip entries whose title or summary contains mojibake
                # ('ä' is a telltale sign of mis-decoded UTF-8).
                if 'ä' in data_title or 'ä' in data_content:
                    continue
                print(real_url, data_title, data_imgs, data_content,
                      data_showtime, author_name, author_imgs, word)

                # One connection per item, as in the original post; the
                # credentials are redacted there.
                mysql_config = {"host": "*****888888",
                                "port": *****6,
                                "user": "root",
                                "passwd": "***88",
                                "db": "*********",
                                "charset": "utf8"}
                conn = MySQLdb.connect(**mysql_config)
                cursor = conn.cursor()
                # De-duplicate on the MD5 of the article URL: if the row already
                # exists, just merge the new keyword into source_keywords.
                target_url_md5 = md5(real_url)
                cursor.execute("select source_keywords from crawl_result where target_url_md5=%s",
                               (target_url_md5,))
                dat = cursor.fetchone()
                if dat:
                    source_keywords = dat[0]
                    if word not in source_keywords.strip().split(","):
                        source_keywords += ",%s" % word
                        source_keywords = ','.join(set(source_keywords.split(",")))
                        cursor.execute("update crawl_result set source_keywords=%s where target_url_md5=%s",
                                       (source_keywords, target_url_md5))
                        conn.commit()
                        print('updated existing row')
                elif data_content:
                    cursor.execute(
                        "insert into crawl_result(target_url,target_url_md5,addtime,data_title,data_imgs,"
                        "data_content,data_showtime,data_json,source,source_keywords,state,author_name,"
                        "author_imgs,author_id,author_json) "
                        "values(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)",
                        (real_url, target_url_md5, int(time.time()), data_title, data_imgs, data_content,
                         data_showtime, '', 6, word, 0, author_name, author_imgs, '', ''))
                    conn.commit()
                    print('inserted new row')
                cursor.close()
                conn.close()

if __name__ == '__main__':
    # multiprocessing.dummy provides the Pool API backed by threads, which is
    # what this I/O-bound crawler actually needs.
    from multiprocessing.dummy import Pool

    pool = Pool(20)
    kws_list = ['化妆水']  # keywords to search for
    pool.map(crawl_toutiao, kws_list)
    pool.close()
    pool.join()
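The helper img_to_tencent, imported at the top from utils.img_to_tencent, is not included in the post. Judging from the call sites (a single image URL, return value ignored), it re-hosts the image on Tencent Cloud storage. Below is a minimal hypothetical stand-in so the script can run: the download part is real, the upload is left as a stub.

# utils/img_to_tencent.py -- hypothetical stand-in for the author's helper.
import requests

def img_to_tencent(img_url):
    # Fetch the image bytes; a real implementation would upload them
    # (e.g. via the Tencent COS SDK) and record the new URL.
    if not img_url:
        return
    try:
        resp = requests.get(img_url, timeout=10)
        resp.raise_for_status()
        data = resp.content  # image bytes, ready to hand to your storage backend
    except Exception as e:
        print('img_to_tencent failed for %s: %s' % (img_url, e))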

Original post: https://blog.csdn.net/zhplz123/article/details/106283019
