# coding: utf-8
# python爬虫爬取今日头条_Python 爬虫实例(2)—— 爬取今日头条


import base64
import hashlib
import json
import logging.handlers
import pickle
import random
import re
import sqlite3
import sys
import threading
import time
import urllib2
import uuid
from datetime import datetime

import redis
import requests
import urllib3

# Python 2 only: force the process-wide default string encoding to UTF-8 so
# that implicit str/unicode conversions of Chinese text do not fail.
reload(sys)
sys.setdefaultencoding('utf-8')

# One HTTP session shared by every request the crawler makes.
session = requests.session()

# Feed API endpoint and the site root that relative article links are joined to.
_FEED_URL = "http://www.toutiao.com/api/pc/feed/"
_SITE_ROOT = "http://www.toutiao.com"

# Default SQLite database that crawled rows are written to.
_DB_PATH = "C:\\Users\\xuchunlin\\PycharmProjects\\study\\db.sqlite3"

# Query parameters sent with every feed request; "widen" is added per page.
_FEED_PARAMS = {
    "category": "news_game",
    "utm_source": "toutiao",
    "max_behot_time": "0",
    "max_behot_time_tmp": "0",
    "tadrequire": "true",
    "as": "479BB4B7254C150",
    "cp": "7E0AC8874BB0985",
}

# Headers for the feed (XHR-style) request.
_FEED_HEADERS = {
    "Host": "www.toutiao.com",
    "Connection": "keep-alive",
    "Accept": "text/javascript, text/html, application/xml, text/xml, */*",
    "X-Requested-With": "XMLHttpRequest",
    "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.113 Safari/537.36",
    "Content-Type": "application/x-www-form-urlencoded",
    "Referer": "http://www.toutiao.com/ch/news_hot/",
    "Accept-Encoding": "gzip, deflate",
    "Accept-Language": "zh-CN,zh;q=0.8",
}

# Headers for fetching an individual article page.
_ARTICLE_HEADERS = {
    "Host": "www.toutiao.com",
    "Connection": "keep-alive",
    "Cache-Control": "max-age=0",
    "Upgrade-Insecure-Requests": "1",
    "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.113 Safari/537.36",
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
    "Accept-Encoding": "gzip, deflate",
    "Accept-Language": "zh-CN,zh;q=0.8",
}

# Feed-item keys, in the order they are printed and bound to the INSERT.
_ITEM_FIELDS = (
    "title",
    "chinese_tag",
    "media_avatar_url",
    "is_feed_ad",
    "tag_url",
    "tag",
    "label",
    "abstract",
    "source_url",
)

_INSERT_SQL = (
    "INSERT INTO toutiao (title,chinese_ta,media_avatar_url,is_feed_ad,"
    "tag_url,tag,label,abstract,source_url,url,contentData)"
    "VALUES (?,?,?,?,?,?,?,?,?,?,?)"
)


def md5(str):
    """Return the hexadecimal MD5 digest of *str*.

    The digest of an article link is used as the row's unique key. Text
    input is encoded as UTF-8 before hashing; byte strings are hashed as-is.

    The parameter name shadows the builtin ``str``; it is kept unchanged so
    existing callers are unaffected.
    """
    import hashlib

    payload = str if isinstance(str, bytes) else str.encode("utf-8")
    return hashlib.md5(payload).hexdigest()


def _fetch_article_html(article_url):
    """Download an article page, returning "" when the request fails."""
    try:
        return session.get(url=article_url, headers=_ARTICLE_HEADERS).content
    except Exception as exc:
        # Best effort: one unreachable article must not abort the crawl, and
        # the previous article's body must never be reused for this one.
        print(exc)
        return ""


def _extract_content(html):
    """Return the first regex match in *html*, or "" when matching fails."""
    # NOTE(review): this pattern looks like it lost its surrounding HTML tags;
    # as written the first match is always the empty string. Confirm the
    # intended expression before relying on contentData.
    try:
        return re.findall('(.*?)', html)[0]
    except (IndexError, TypeError):
        return ""


def _store_item(cx, item, article_url, content):
    """Print one feed item and insert it into the ``toutiao`` table."""
    print("正在插入链接 %s 数据" % (article_url))
    # Read every field before printing so a missing key fails up front.
    values = [item[key] for key in _ITEM_FIELDS]
    for value in values:
        print(value)
    row = [str(value) for value in values]
    row.append(str(md5(str(article_url))))
    row.append(str(content))
    cx.execute(_INSERT_SQL, tuple(row))
    cx.commit()


def jinri(db_path=_DB_PATH):
    """Crawl the feed pages and store each listed article in SQLite.

    Requests the feed with ``widen`` set to 1 through 19. For every item of a
    successful response, downloads the article page and inserts one row into
    the ``toutiao`` table. A failure on a single item is printed and skipped.

    Args:
        db_path: Path of the SQLite database file to write to.

    Returns:
        An empty list; nothing is ever appended to it. It is kept because the
        script prints the return value.
    """
    list_data = []
    cx = sqlite3.connect(db_path, check_same_thread=False)
    cx.text_factory = str
    try:
        for page in range(1, 20):
            # Request the feed to obtain the article links for this page.
            params = dict(_FEED_PARAMS, widen=str(page))
            body = session.get(url=_FEED_URL, params=params,
                               headers=_FEED_HEADERS).text
            feed = json.loads(body)

            # NOTE(review): the status is read from "message1"; the API
            # presumably reports it under a different key -- confirm against
            # a live response.
            if feed.get("message1") != "success":
                print("请求失败")
                continue

            for item in feed["data"]:
                article_url = _SITE_ROOT + str(item["source_url"])
                content = _extract_content(_fetch_article_html(article_url))
                try:
                    _store_item(cx, item, article_url, content)
                except Exception as e:
                    # Per-item boundary: report the failure and carry on.
                    print(e)
                    print("cha ru shi bai")
    finally:
        cx.close()
    return list_data


if __name__ == "__main__":
    print(jinri())

  • 0
    点赞
  • 1
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值