# coding:utf-8 import base64 import random, re import sqlite3 import redis, pickle import json, time import urllib3,urllib2,hashlib from datetime import datetime import threading import logging.handlers import sys reload(sys) sys.setdefaultencoding('utf-8') import uuid import requests session = requests.session() #把连接加密成 MD5 生成唯一的主键 def md5(str): import hashlib m = hashlib.md5() m.update(str) return m.hexdigest() def jinri(): list_data = [] for i in range(1,20): #请求得到url 链接 url = "http://www.toutiao.com/api/pc/feed/" data = { "category":"news_game", "utm_source":"toutiao", "widen":str(i), "max_behot_time":"0", "max_behot_time_tmp":"0", "tadrequire":"true", "as":"479BB4B7254C150", "cp":"7E0AC8874BB0985", } headers = { "Host":"www.toutiao.com", "Connection":"keep-alive", "Accept":"text/javascript, text/html, application/xml, text/xml, */*", "X-Requested-With":"XMLHttpRequest", "User-Agent":"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.113 Safari/537.36", "Content-Type":"application/x-www-form-urlencoded", "Referer":"http://www.toutiao.com/ch/news_hot/", "Accept-Encoding":"gzip, deflate", "Accept-Language":"zh-CN,zh;q=0.8", } result1 = session.get(url=url,params=data,headers=headers).text result2 =json.loads(result1) if result2["message1"] =="success": for i in result2["data"]: source_url =i["source_url"] headers = { "Host":"www.toutiao.com", "Connection":"keep-alive", "Cache-Control":"max-age=0", "Upgrade-Insecure-Requests":"1", "User-Agent":"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.113 Safari/537.36", "Accept":"text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8", "Accept-Encoding":"gzip, deflate", "Accept-Language":"zh-CN,zh;q=0.8", } url1 = "http://www.toutiao.com" + str(source_url) try: return_data = session.get(url=url1, headers=headers).content except: pass # print return_data try: contentData = re.findall(' 
<article>(.*?)</article>',return_data)[0] except: contentData = "" cx = sqlite3.connect("C:\\Users\\xuchunlin\\PycharmProjects\\study\\db.sqlite3",check_same_thread=False) cx.text_factory = str try: print "正在插入链接 %s 数据" % (url) chinese_ta = i["chinese_tag"] media_avatar_url = i["media_avatar_url"] is_feed_ad = i["is_feed_ad"] tag_url = i["tag_url"] title = i["title"] tag = i["tag"] label = str(i["label"]) abstract = i["abstract"] source_url = i["source_url"] print title print chinese_ta print media_avatar_url print is_feed_ad print tag_url print tag print label print abstract print source_url url2 = md5(str(url1)) cx.execute("INSERT INTO toutiao (title,chinese_ta,media_avatar_url,is_feed_ad,tag_url,tag,label,abstract,source_url,url,contentData)VALUES (?,?,?,?,?,?,?,?,?,?,?)", (str(title), str(chinese_ta), str(media_avatar_url), str(is_feed_ad), str(tag_url), str(tag), str(label), str(abstract), str(source_url), str(url2),str(contentData))) cx.commit() # time.sleep(2) except Exception as e: print e print "cha ru shi bai " cx.close() else: print "请求失败" return list_data print jinri()
# 爬虫很简单,难的是自己去分析网页、解析网页和爬虫的效率。
# (Crawling itself is easy -- the hard parts are analysing and parsing the
# pages yourself, and making the crawler efficient.)