Python Crawler: Scraping a News Site, Deduplicating Against Existing Data, and Inserting into a Database

I. I have recently been planning an API for a recommendation system. As part of that work I built a news site to use as a test bed (see my earlier posts). What I am sharing today is the crawler code that feeds data to that news site.

  1. The results first
    [Screenshots of the crawler output]
    When duplicate data is detected, the program does not insert it into the database.
II. Approach
  1. Read the records that already exist in the database and keep only the most recent titles (the code below takes the last 40 per category).
  2. On each crawl, take only the first 30 or so items shown on the page.
  3. While crawling, do a simple comparison on the title to skip duplicates (a distilled sketch of this check follows the list).
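Before the full source, here is a minimal sketch of just the title-based deduplication step, with the 36kr-specific parsing stripped out. The table `new`, the columns `new_title` / `new_cate_id` / `new_source`, and the use of pandas over a pymysql connection come from the code below; the function names and the parameterized query are illustrative.

```python
import pandas as pd
import pymysql


def recent_titles(conn, cate_id, limit=40):
    """Return the most recent titles already stored for one category."""
    df = pd.read_sql(
        "select new_title from new where new_cate_id = %s and new_source = '36kr';",
        conn, params=[cate_id],
    )
    return set(df.tail(limit)["new_title"])


def keep_only_new(scraped_items, conn, cate_id):
    """Drop any scraped item whose title is already in the database."""
    seen = recent_titles(conn, cate_id)
    return [item for item in scraped_items if item["title"] not in seen]
```

In the full source the same comparison happens inline: `request_page()` checks each title against `sql_title_list(cate_id)` before appending the record.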
III. Source code
```python
import datetime
import json
import time
import requests
import pandas as pd
from lxml import etree
from bs4 import BeautifulSoup
import pymysql


class Spider36Kr(object):
    def __init__(self):
        self.conn = pymysql.connect(  # connect to MySQL
            host='localhost',
            user='root',
            passwd='963369',
            db='news_data',
            port=3306,
            charset='utf8'
        )
        self.index = self.get_index()  # last id currently in the database
        self.add_index = list()  # ids added during this run
        self.url_list = ["https://36kr.com/information/technology", "https://36kr.com/information/travel",
                         "https://36kr.com/information/happy_life", "https://36kr.com/information/real_estate",
                         "https://36kr.com/information/web_zhichang"]
        self.headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36",
            "Cookie": "acw_tc=2760823515711283155342601ebecc825e95a1d72e78ea3a1a457be0ef9d9d; kr_stat_uuid=kw8ZH26185472; krnewsfrontss=32b5a2ca9ace80d37d4885b144118ef8; M-XSRF-TOKEN=f204eeea5347017f38009858d2ee0eafb2894283c8ba69c228e3837114675d0d; M-XSRF-TOKEN.sig=GQU3yBNWi1oqskE4i2J0jyRpH8BpH13GLSsJ0sqFrDI; Hm_lvt_713123c60a0e86982326bae1a51083e1=1572744686,1572749871,1572825196,1572829918; Hm_lvt_1684191ccae0314c6254306a8333d090=1572744686,1572749871,1572825196,1572829918; sensorsdata2015jssdkcross=%7B%22distinct_id%22%3A%22kw8ZH26185472%22%2C%22%24device_id%22%3A%2216dce8bab54a83-0d3eedac9e4ffb-e343166-1327104-16dce8bab55a79%22%2C%22props%22%3A%7B%22%24latest_referrer%22%3A%22https%3A%2F%2F36kr.com%2Finformation%2Ftechnology%22%2C%22%24latest_referrer_host%22%3A%2236kr.com%22%2C%22%24latest_traffic_source_type%22%3A%22%E5%BC%95%E8%8D%90%E6%B5%81%E9%87%8F%22%2C%22%24latest_search_keyword%22%3A%22%E6%9C%AA%E5%8F%96%E5%88%B0%E5%80%BC%22%7D%2C%22first_id%22%3A%2216dce8bab54a83-0d3eedac9e4ffb-e343166-1327104-16dce8bab55a79%22%7D; Hm_lpvt_1684191ccae0314c6254306a8333d090=1572830180; Hm_lpvt_713123c60a0e86982326bae1a51083e1=1572830180; SERVERID=6754aaff36cb16c614a357bbc08228ea|1572830181|1572829919",
        }
        self.deep_headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36",
            "Cookie": "acw_tc=2760823515711283155342601ebecc825e95a1d72e78ea3a1a457be0ef9d9d; kr_stat_uuid=kw8ZH26185472; krnewsfrontss=32b5a2ca9ace80d37d4885b144118ef8; M-XSRF-TOKEN=f204eeea5347017f38009858d2ee0eafb2894283c8ba69c228e3837114675d0d; M-XSRF-TOKEN.sig=GQU3yBNWi1oqskE4i2J0jyRpH8BpH13GLSsJ0sqFrDI; Hm_lvt_713123c60a0e86982326bae1a51083e1=1572744686,1572749871,1572825196,1572829918; Hm_lvt_1684191ccae0314c6254306a8333d090=1572744686,1572749871,1572825196,1572829918; sensorsdata2015jssdkcross=%7B%22distinct_id%22%3A%22kw8ZH26185472%22%2C%22%24device_id%22%3A%2216dce8bab54a83-0d3eedac9e4ffb-e343166-1327104-16dce8bab55a79%22%2C%22props%22%3A%7B%22%24latest_referrer%22%3A%22%22%2C%22%24latest_referrer_host%22%3A%22%22%2C%22%24latest_traffic_source_type%22%3A%22%E7%9B%B4%E6%8E%A5%E6%B5%81%E9%87%8F%22%2C%22%24latest_search_keyword%22%3A%22%E6%9C%AA%E5%8F%96%E5%88%B0%E5%80%BC_%E7%9B%B4%E6%8E%A5%E6%89%93%E5%BC%80%22%7D%2C%22first_id%22%3A%2216dce8bab54a83-0d3eedac9e4ffb-e343166-1327104-16dce8bab55a79%22%7D; Hm_lpvt_713123c60a0e86982326bae1a51083e1=1572830477; Hm_lpvt_1684191ccae0314c6254306a8333d090=1572830477; SERVERID=6754aaff36cb16c614a357bbc08228ea|1572830478|1572829919",
            "authority": "36kr.com"

        }

    def __del__(self):
        print("关闭数据库链接...")
        self.conn.close()

    def run(self):
        """Crawl every category page in turn and insert the new records."""
        for x, url in enumerate(self.url_list):
            print("Crawling:", url, x + 1)
            data = self.request_page(url, x + 1)  # fetch the new (deduplicated) records
            try:
                self.insert_data(data)  # insert into the database
            except pymysql.err.InternalError:
                print("An error occurred while inserting the data")
            print("Sleeping for 20 seconds...")
            time.sleep(20)

    def get_index(self):
        """Return the id of the last row currently in the `new` table."""
        sql = 'select id from new'
        new = pd.read_sql(sql, self.conn).tail(1)["id"].tolist()[0]
        return new

    def spider_one(self, num):
        """
        Crawl a single category page and insert its data.
        :param num: category number (1-5), in the order of self.url_list
        :return:
        """
        data = self.request_page(self.url_list[num - 1], num)
        self.insert_data(data)

    def request_page(self, temp_url, cate_id):
        """
        Request a category page and extract its news data.
        :param temp_url: category page URL
        :param cate_id: category id (1-5)
        :return: deduplicated data as a DataFrame
        """
        response = requests.get(temp_url, headers=self.headers)  # send the request
        content = response.content.decode()  # decode the response body
        html = etree.HTML(content)  # parse into an element tree
        data_list = html.xpath("//script")  # grab all <script> tags

        temp_data = None  # will hold the JSON embedded in window.initialState
        for data in data_list:
            try:
                data = str(data.text).split("window.initialState=")[1]
                temp_data = json.loads(data)
            except IndexError:
                pass

        data_all = list()  # collected news records
        for x in range(10000):
            try:
                new_source = "36kr"
                new_title = temp_data["information"]["informationList"][x]["title"]
                index_image_url = temp_data["information"]["informationList"][x]["images"][0]
                new_time = datetime.datetime.now().strftime('%Y-%m-%d')
                digest = temp_data["information"]["informationList"][x]["summary"]
                url = "https://36kr.com/p/" + str(temp_data["information"]["informationList"][x]["entity_id"])
                new_content = self.deep_spider(url)
                if str(new_title) not in self.sql_title_list(cate_id):
                    data_all.append(
                        [new_title, new_source, new_time, digest, index_image_url, new_content, 0, 0, cate_id])
                    print(x + 1, new_title, "提取完成...")
                else:
                    print(x + 1, new_title, "检测到重复数据...")
            except IndexError:
                print("数据提取完成...")
                break

        data_all.sort()  # lists compare element-wise, so this sorts the records by title
        data_all = pd.DataFrame(data_all, columns=["new_title", "new_source", "new_time", "digest", "index_image_url",
                                                   "new_content", "new_seenum", "new_disnum", "new_cate_id"])

        return data_all

    def deep_spider(self, url):
        """
        Fetch an article page and extract its body.
        :param url: article URL
        :return: the article body as a string of <p> tags
        """
        response = requests.get(url, headers=self.deep_headers)
        content = response.content.decode()
        soup = BeautifulSoup(content, "lxml")
        data = soup.find_all('p')[0: -11]  # drop the last 11 <p> tags (page boilerplate)
        data_str = ""
        for x in data:
            data_str = data_str + str(x)
        return data_str

    def sql_title_list(self, num):
        """
        Read the news records already stored in MySQL for one category.
        :param num: category id (1 tech, 2 travel, 3 life, 4 real estate, 5 workplace)
        :return: list of the most recent titles
        """
        cate_id = num if num in (1, 2, 3, 4, 5) else 3  # fall back to "life" for unknown ids
        df = pd.read_sql("select * from new where new_cate_id = %s and new_source = '36kr';",
                         self.conn, params=[cate_id])
        df = df.tail(40)["new_title"].to_list()
        return df

    def insert_data(self, data):
        """Insert the DataFrame of new records into the `new` table row by row."""
        cursor = self.conn.cursor()  # create a cursor
        sql = "insert into new(new_time, index_image_url, new_title, new_source," \
              " new_seenum, new_disnum, digest, new_content, new_cate_id) values(%s, %s, %s, %s, %s, %s, %s, %s, %s)"
        print(data.shape, "Inserting data into the database...")
        for x, y, z, e, f, j, h, i, g in zip(data["new_time"], data["index_image_url"], data["new_title"],
                                             data["new_source"], data["new_seenum"],
                                             data["new_disnum"],
                                             data["digest"], data["new_content"], data["new_cate_id"]):
            cursor.execute(sql, (x, y, z, e, f, j, h, i, g))
            self.conn.commit()
            print(z, "inserted successfully...")
            self.index += 1
            self.add_index.append(self.index)
        cursor.close()


if __name__ == '__main__':
    spider = Spider36Kr()
    spider.run()
    # spider.spider_one(5)
```

Note: Feel free to adapt this to your own setup (database settings and so on); it is shared for learning and discussion only. A sketch of the table schema I assume is given at the end of this post.
Note: Once the news items are in the database, the updates show up on the site right away. When the project is finished I will publish the news site and the recommendation system on GitHub.
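For reference, the crawler expects a MySQL database `news_data` containing a table named `new`. The original post does not include the table definition, so the schema below is an assumption inferred from the columns the code reads and writes; adjust the names, types, and lengths to your own setup.

```python
import pymysql

# Hypothetical schema inferred from the columns used by the crawler above;
# the actual table may use different types or lengths.
CREATE_NEW_TABLE = """
create table if not exists `new` (
    id              int auto_increment primary key,
    new_title       varchar(255) not null,
    new_source      varchar(32)  not null,
    new_time        date         not null,
    digest          text,
    index_image_url varchar(512),
    new_content     longtext,
    new_seenum      int default 0,  -- view count
    new_disnum      int default 0,  -- comment count
    new_cate_id     int not null    -- 1 tech, 2 travel, 3 life, 4 real estate, 5 workplace
)
"""

if __name__ == "__main__":
    conn = pymysql.connect(host="localhost", user="root", passwd="963369",
                           db="news_data", port=3306, charset="utf8")
    with conn.cursor() as cursor:
        cursor.execute(CREATE_NEW_TABLE)
    conn.commit()
    conn.close()
```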
