Python Crawler: Scraping a News Site and Inserting Deduplicated Data into a Database
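This script crawls five 36Kr channel pages (tech, travel, life, real estate, workplace), pulls the article list out of the window.initialState JSON blob that 36Kr embeds in an inline <script> tag, fetches each article's body, skips titles that already exist in MySQL, and inserts the rest with pymysql.

The extraction trick is worth seeing on its own. Here is a minimal standalone sketch (the HTML string below is made up for illustration; only the window.initialState key comes from 36Kr's real pages):

import json
from lxml import etree

# Hypothetical fragment mimicking how 36Kr embeds page state in a <script> tag.
page = '<html><script>window.initialState={"information":{"informationList":[{"title":"demo"}]}}</script></html>'
html = etree.HTML(page)
for script in html.xpath("//script"):
    text = script.text or ""
    if "window.initialState=" in text:
        state = json.loads(text.split("window.initialState=")[1])
        print(state["information"]["informationList"][0]["title"])  # -> demo

The full listing: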

import datetime
import json
import time

import requests
import pandas as pd
from lxml import etree
from bs4 import BeautifulSoup
import pymysql

class Spider36Kr(object):

    def __init__(self):
        self.conn = pymysql.connect(  # connect to MySQL
            host='localhost',
            user='root',
            passwd='963369',
            db='news_data',
            port=3306,
            charset='utf8'
        )
        self.index = self.get_index()  # last id currently in the database
        self.add_index = list()  # ids added during this run
        self.url_list = ["https://36kr.com/information/technology", "https://36kr.com/information/travel",
                         "https://36kr.com/information/happy_life", "https://36kr.com/information/real_estate",
                         "https://36kr.com/information/web_zhichang"]
        # Session cookies copied from a browser visit; replace them with your own.
        self.headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36",
            "Cookie": "acw_tc=2760823515711283155342601ebecc825e95a1d72e78ea3a1a457be0ef9d9d; kr_stat_uuid=kw8ZH26185472; krnewsfrontss=32b5a2ca9ace80d37d4885b144118ef8; M-XSRF-TOKEN=f204eeea5347017f38009858d2ee0eafb2894283c8ba69c228e3837114675d0d; M-XSRF-TOKEN.sig=GQU3yBNWi1oqskE4i2J0jyRpH8BpH13GLSsJ0sqFrDI; Hm_lvt_713123c60a0e86982326bae1a51083e1=1572744686,1572749871,1572825196,1572829918; Hm_lvt_1684191ccae0314c6254306a8333d090=1572744686,1572749871,1572825196,1572829918; sensorsdata2015jssdkcross=%7B%22distinct_id%22%3A%22kw8ZH26185472%22%2C%22%24device_id%22%3A%2216dce8bab54a83-0d3eedac9e4ffb-e343166-1327104-16dce8bab55a79%22%2C%22props%22%3A%7B%22%24latest_referrer%22%3A%22https%3A%2F%2F36kr.com%2Finformation%2Ftechnology%22%2C%22%24latest_referrer_host%22%3A%2236kr.com%22%2C%22%24latest_traffic_source_type%22%3A%22%E5%BC%95%E8%8D%90%E6%B5%81%E9%87%8F%22%2C%22%24latest_search_keyword%22%3A%22%E6%9C%AA%E5%8F%96%E5%88%B0%E5%80%BC%22%7D%2C%22first_id%22%3A%2216dce8bab54a83-0d3eedac9e4ffb-e343166-1327104-16dce8bab55a79%22%7D; Hm_lpvt_1684191ccae0314c6254306a8333d090=1572830180; Hm_lpvt_713123c60a0e86982326bae1a51083e1=1572830180; SERVERID=6754aaff36cb16c614a357bbc08228ea|1572830181|1572829919",
        }
        self.deep_headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36",
            "Cookie": "acw_tc=2760823515711283155342601ebecc825e95a1d72e78ea3a1a457be0ef9d9d; kr_stat_uuid=kw8ZH26185472; krnewsfrontss=32b5a2ca9ace80d37d4885b144118ef8; M-XSRF-TOKEN=f204eeea5347017f38009858d2ee0eafb2894283c8ba69c228e3837114675d0d; M-XSRF-TOKEN.sig=GQU3yBNWi1oqskE4i2J0jyRpH8BpH13GLSsJ0sqFrDI; Hm_lvt_713123c60a0e86982326bae1a51083e1=1572744686,1572749871,1572825196,1572829918; Hm_lvt_1684191ccae0314c6254306a8333d090=1572744686,1572749871,1572825196,1572829918; sensorsdata2015jssdkcross=%7B%22distinct_id%22%3A%22kw8ZH26185472%22%2C%22%24device_id%22%3A%2216dce8bab54a83-0d3eedac9e4ffb-e343166-1327104-16dce8bab55a79%22%2C%22props%22%3A%7B%22%24latest_referrer%22%3A%22%22%2C%22%24latest_referrer_host%22%3A%22%22%2C%22%24latest_traffic_source_type%22%3A%22%E7%9B%B4%E6%8E%A5%E6%B5%81%E9%87%8F%22%2C%22%24latest_search_keyword%22%3A%22%E6%9C%AA%E5%8F%96%E5%88%B0%E5%80%BC_%E7%9B%B4%E6%8E%A5%E6%89%93%E5%BC%80%22%7D%2C%22first_id%22%3A%2216dce8bab54a83-0d3eedac9e4ffb-e343166-1327104-16dce8bab55a79%22%7D; Hm_lpvt_713123c60a0e86982326bae1a51083e1=1572830477; Hm_lpvt_1684191ccae0314c6254306a8333d090=1572830477; SERVERID=6754aaff36cb16c614a357bbc08228ea|1572830478|1572829919",
            "authority": "36kr.com"
        }

    def __del__(self):
        print("Closing database connection...")
        self.conn.close()

    def run(self):
        for x, url in enumerate(self.url_list):
            print("Crawling:", url, x + 1)
            data = self.request_page(url, x + 1)  # fetch and dedup the channel's articles
            try:
                self.insert_data(data)  # insert into MySQL
            except pymysql.err.InternalError:
                print("Exception occurred while inserting data")
            print("Sleeping for 20 seconds...")
            time.sleep(20)

    def get_index(self):
        sql = 'select id from new'
        new = pd.read_sql(sql, self.conn).tail(1)["id"].tolist()[0]  # last (highest) id in the table
        return new

    def spider_one(self, num):
        """
        Crawl a single channel page.
        :param num: 1-based index into self.url_list
        :return:
        """
        data = self.request_page(self.url_list[num - 1], num)
        self.insert_data(data)

    def request_page(self, temp_url, cate_id):
        """
        Request one channel page and extract its article data.
        :return: deduplicated data as a DataFrame
        """
        # headers must be passed as a keyword: the second positional
        # argument of requests.get is params, not headers.
        response = requests.get(temp_url, headers=self.headers)  # send the request
        content = response.content.decode()  # decode the response body
        html = etree.HTML(content)  # parse into an element tree
        data_list = html.xpath("//script")  # the page data lives in an inline <script>
        temp_data = None  # JSON pulled out of the JS
        for data in data_list:
            try:
                data = str(data.text).split("window.initialState=")[1]
                temp_data = json.loads(data)
            except IndexError:
                pass
        data_all = list()  # extract the news records
        for x in range(10000):
            try:
                new_source = "36kr"
                new_title = temp_data["information"]["informationList"][x]["title"]
                index_image_url = temp_data["information"]["informationList"][x]["images"][0]
                new_time = datetime.datetime.now().strftime('%Y-%m-%d')
                digest = temp_data["information"]["informationList"][x]["summary"]
                url = "https://36kr.com/p/" + str(temp_data["information"]["informationList"][x]["entity_id"])
                new_content = self.deep_spider(url)
                if str(new_title) not in self.sql_title_list(cate_id):
                    data_all.append(
                        [new_title, new_source, new_time, digest, index_image_url, new_content, 0, 0, cate_id])
                    print(x + 1, new_title, "extracted...")
                else:
                    print(x + 1, new_title, "duplicate detected, skipping...")
            except IndexError:  # ran past the end of the article list
                print("Extraction finished...")
                break
        data_all.sort()
        data_all = pd.DataFrame(data_all, columns=["new_title", "new_source", "new_time", "digest", "index_image_url",
                                                   "new_content", "new_seenum", "new_disnum", "new_cate_id"])
        return data_all

    def deep_spider(self, url):
        """
        Extract the article body from a news URL.
        :param url:
        :return: concatenated <p> tags as an HTML string
        """
        response = requests.get(url, headers=self.deep_headers)
        content = response.content.decode()
        soup = BeautifulSoup(content, "lxml")
        data = soup.find_all('p')[0: -11]  # the last 11 <p> tags are page boilerplate
        data_str = ""
        for x in data:
            data_str = data_str + str(x)
        return data_str

    def sql_title_list(self, num):
        """
        Read recent 36kr titles for one category from MySQL.
        Categories: 1 tech, 2 auto, 3 life, 4 real estate, 5 workplace.
        :param num: category id
        :return: list of the 40 most recent titles
        """
        sql = "select new_title from new where new_cate_id = %s and new_source = '36kr';"
        df = pd.read_sql(sql, self.conn, params=[num])
        return df.tail(40)["new_title"].to_list()

    def insert_data(self, data):
        cursor = self.conn.cursor()  # create a cursor
        sql = "insert into new(new_time, index_image_url, new_title, new_source," \
              " new_seenum, new_disnum, digest, new_content, new_cate_id) values(%s, %s, %s, %s, %s, %s, %s, %s, %s)"
        print(data.shape, "inserting data into the database...")
        columns = ["new_time", "index_image_url", "new_title", "new_source", "new_seenum",
                   "new_disnum", "digest", "new_content", "new_cate_id"]
        for row in data[columns].itertuples(index=False):
            cursor.execute(sql, tuple(row))
            self.conn.commit()
            print(row.new_title, "inserted successfully...")
            self.index += 1
            self.add_index.append(self.index)
        cursor.close()

if __name__ == '__main__':
    spider = Spider36Kr()
    spider.run()
    # spider.spider_one(5)
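A note on the dedup strategy: sql_title_list only compares against the 40 most recent titles per category, so anything older than that window can slip back in. If you control the schema, the database can enforce uniqueness itself. Here is a sketch under stated assumptions: the key name uq_title_cate and the 191-character index prefix are my choices, not part of the original table definition.

import pymysql

conn = pymysql.connect(host='localhost', user='root', passwd='963369',
                       db='news_data', port=3306, charset='utf8')
cursor = conn.cursor()
# One-time migration (assumed schema): a unique key lets MySQL reject repeats.
# The column prefix keeps the index under MySQL's key-length limit.
cursor.execute("alter table new add unique key uq_title_cate (new_title(191), new_cate_id)")
# insert ignore then silently skips rows that would violate the unique key.
sql = ("insert ignore into new(new_time, index_image_url, new_title, new_source, "
       "new_seenum, new_disnum, digest, new_content, new_cate_id) "
       "values(%s, %s, %s, %s, %s, %s, %s, %s, %s)")
cursor.execute(sql, ('2019-11-04', 'https://example.com/img.png', 'some title',
                     '36kr', 0, 0, 'digest', '<p>body</p>', 1))
conn.commit()
cursor.close()

After an insert ignore, cursor.rowcount is 1 for a fresh row and 0 for a skipped duplicate, which makes the per-category title query unnecessary.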
