# 一.爬取三种新闻网站上某些分类下的热点新闻的相关数据利用MySQL存储。
# 1)编写Python程序,使用户通过在控制台输入相关条件,进而查询到想了解的信息。
import requests
from bs4 import BeautifulSoup
import pymysql
"""获取HTML"""
def get_html(url, headers):
r = requests.get(url, headers=headers)
r.encoding = r.apparent_encoding
return r.text
"""获取实时新闻"""
def get_realtime_news(html):
soup = BeautifulSoup(html, 'html.parser')
all_topics = soup.find_all('tr')[1:]
# 创建连接
conn = pymysql.connect("localhost", "root", "123456")
# 创建游标
cursor = conn.cursor()
# 创建数据库的sql(如果数据库存在就不创建,防止异常)
# sql_data = "CREATE DATABASE IF NOT EXISTS News"
# 执行创建数据库的sql
# cursor.execute(sql_data)
cursor.execute("use News")
# cursor.execute("DROP TABLE IF EXISTS real_time")
# 使用预处理语句创建表
sql_create = """CREATE TABLE real_time (
rank INT NOT NULL,
topic_name CHAR(40),
heat INT,
PRIMARY KEY (`rank`))"""
# cursor.execute(sql_create)
for each_topic in all_topics:
topic_times = each_topic.find('td', class_='last') # 点击量
topic_rank = each_topic.find('td', class_='first') # 排名
topic = each_topic.find('td', class_='keyword') # 标题目
if topic_rank != None and topic != None and topic_times != None:
topic_rank = each_topic.find('td', class_='first').get_text().replace(' ', '').replace('\n', '')
topic = each_topic.find('td', class_='keyword').get_text().replace(' ', '').replace('\n', '')
topic_times = each_topic.find('td', class_='last').get_text().replace(' ', '').replace('\n', '')
# SQL 插入语句
val = [topic_rank, topic, topic_times]
sql_insert = "INSERT INTO real_time(rank,topic_name,heat) VALUES(%s,%s,%s) "
try:
# 执行sql语句
cursor.execute(sql_insert, val)
# 提交到数据库执行
conn.commit()
except:
# 如果发生错误则回滚
conn.rollback()
# 关闭连接
cursor.close()
conn.close()
"""查询实时新闻"""
def select_realtime_news():
# 创建连接
conn = pymysql.connect("localhost", "root", "123456")
# 创建游标
cursor = conn.cursor()
cursor.execute("use News")
# SQL 查询语句
sql_select = "SELECT * FROM real_time"
try:
# 执行SQL语句
cursor.execute(sql_select)
# 获取所有记录列表
results = cursor.fetchall()
for row in results:
rank = row[0]
topic_name = row[1]
heat = row[2]
# 打印结果
print("排名:%d ,标题:%s ,热度:%d" % \
(rank, topic_name, heat,))
except:
print("Error: unable to fetch data")
# 关闭连接
cursor.close()
conn.close()
"""获取娱乐新闻"""
def get_pastime_news(html):
soup = BeautifulSoup(html, 'html.parser')
all_topics = soup.find_all('tr')[1:]
# 创建连接
conn = pymysql.connect("localhost", "root", "123456")
# 创建游标
cursor = conn.cursor()
# 创建数据库的sql(如果数据库存在就不创建,防止异常)
# sql_data = "CREATE DATABASE IF NOT EXISTS News"
# 执行创建数据库的sql
# cursor.execute(sql_data)
cursor.execute("use News")
cursor.execute("DROP TABLE IF EXISTS pastime_news")
# 使用预处理语句创建表
sql_create = """CREATE TABLE pastime_news (
rank INT NOT NULL,
topic_name CHAR(40),
heat INT,
PRIMARY KEY (`rank`))"""
cursor.execute(sql_create)
for each_topic in all_topics:
topic_times = each_topic.find('td', class_='last') # 搜索指数
topic_rank = each_topic.find('td', class_='first') # 排名
topic = each_topic.find('td', class_='keyword') # 标题目
if topic_rank != None and topic != None and topic_times != None:
topic_rank = each_topic.find('td', class_='first').get_text().replace(' ', '').replace('\n', '')
topic = each_topic.find('td', class_='keyword').get_text().replace(' ', '').replace('\n', '')
topic_times = each_topic.find('td', class_='last').get_text().replace(' ', '').replace('\n', '')
# SQL 插入语句
val = [topic_rank, topic, topic_times]
sql_insert = "INSERT INTO pastime_news(rank,topic_name,heat) VALUES(%s,%s,%s) "
try:
# 执行sql语句
cursor.execute(sql_insert, val)
# 提交到数据库执行
conn.commit()
except:
# 如果发生错误则回滚
conn.rollback()
# SQL 查询语句
sql_select = "SELECT * FROM pastime_news"
try:
# 执行SQL语句
cursor.execute(sql_select)
# 获取所有记录列表
results = cursor.fetchall()
for row in results:
rank = row[0]
topic_name = row[1]
heat = row[2]
# 打印结果
print ("排名:%d ,标题:%s ,热度:%d" % \
(rank, topic_name, heat))
except:
print ("Error: unable to fetch data")
# 关闭连接
cursor.close()
conn.close()
"""查询娱乐新闻"""
def select_pastime_news():
# 创建连接
conn = pymysql.connect("localhost", "root", "123456")
# 创建游标
cursor = conn.cursor()
cursor.execute("use News")
# SQL 查询语句
sql_select = "SELECT * FROM pastime_news"
try:
# 执行SQL语句
cursor.execute(sql_select)
# 获取所有记录列表
results = cursor.fetchall()
for row in results:
rank = row[0]
topic_name = row[1]
heat = row[2]
# 打印结果
print("排名:%d ,标题:%s ,热度:%d" % \
(rank, topic_name, heat))
except:
print("Error: unable to fetch data")
# 关闭连接
cursor.close()
conn.close()
"""获取教育新闻"""
def get_edu_news(html):
soup = BeautifulSoup(html, 'html.parser')
all_topics = soup.find_all('tr')
# 创建连接
conn = pymysql.connect("localhost", "root", "123456")
# 创建游标
cursor = conn.cursor()
# 创建数据库的sql(如果数据库存在就不创建,防止异常)
# sql_data = "CREATE DATABASE IF NOT EXISTS News"
# 执行创建数据库的sql
# cursor.execute(sql_data)
cursor.execute("use News")
# cursor.execute("DROP TABLE IF EXISTS edu_news")
# 使用预处理语句创建表
# sql_create = """CREATE TABLE edu_news (
# id INT AUTO_INCREMENT PRIMARY KEY,
# titles_name CHAR(40),
# topic_times CHAR(20))"""
# cursor.execute(sql_create)
# print(all_topics)
for each_topic in all_topics:
titles = each_topic.find('td', class_="rank") # 标题
times = each_topic.find('td', class_="cBlue") # 点击量
if times != None and titles != None:
titles = each_topic.find('td', class_="rank").get_text()
times = each_topic.find('td', class_="cBlue").get_text()
# # SQL 插入语句
val = [titles, times]
sql_insert = "INSERT INTO edu_news(titles_name ,topic_times) VALUES(%s,%s) "
try:
# 执行sql语句
# cursor.execute(sql_insert, val)
# 提交到数据库执行
conn.commit()
except:
# 如果发生错误则回滚
conn.rollback()
# SQL 查询语句
sql_select = "SELECT * FROM edu_news"
try:
# 执行SQL语句
cursor.execute(sql_select)
# 获取所有记录列表
results = cursor.fetchall()
# print(results)
for row in results:
id = row[0]
print("排名:", int(id) % 90, end="")
titles_name = row[1]
print(" 标题:", titles_name, end="")
topic_times = row[2]
print(" 点击量:", topic_times)
except:
print ("Error: unable to fetch data")
# 关闭连接
conn.close()
"""查询教育新闻"""
def select_edu_news():
# 创建连接
conn = pymysql.connect("localhost", "root", "123456")
# 创建游标
cursor = conn.cursor()
cursor.execute("use News")
# SQL 查询语句
sql_select = "SELECT * FROM edu_news"
try:
# 执行SQL语句
cursor.execute(sql_select)
# 获取所有记录列表
results = cursor.fetchall()
# print(results)
for row in results:
id = row[0]
print("排名:", int(id) % 90, end="")
titles_name = row[1]
print(" 标题:", titles_name, end="")
topic_times = row[2]
print(" 点击量:", topic_times)
except:
print("Error: unable to fetch data")
# 关闭连接
conn.close()
"""获取体育新闻"""
def get_sports_news(html):
soup = BeautifulSoup(html, 'html.parser')
all_topics = soup.find_all('tr')[1:]
for each_topic in all_topics:
topic_times = each_topic.find('td', class_='last') # 点击量
topic_rank = each_topic.find('td', class_='first') # 排名
topic = each_topic.find('td', class_='keyword') # 标题目
# 创建连接
conn = pymysql.connect("localhost", "root", "123456")
# 创建游标
cursor = conn.cursor()
# 创建数据库的sql(如果数据库存在就不创建,防止异常)
# sql_data = "CREATE DATABASE IF NOT EXISTS News"
# 执行创建数据库的sql
# cursor.execute(sql_data)
cursor.execute("use News")
# cursor.execute("DROP TABLE IF EXISTS sports_news")
# 使用预处理语句创建表
sql_create = """CREATE TABLE sports_news (
rank INT NOT NULL,
topic_name CHAR(40),
heat INT,
PRIMARY KEY (`rank`))"""
# cursor.execute(sql_create)
if topic_rank != None and topic != None and topic_times != None:
topic_rank = each_topic.find('td', class_='first').get_text().replace(' ', '').replace('\n', '')
topic = each_topic.find('td', class_='keyword').get_text().replace(' ', '').replace('\n', '')
topic_times = each_topic.find('td', class_='last').get_text().replace(' ', '').replace('\n', '')
print(topic_rank,topic,topic_times)
# SQL 插入语句
val = [topic_rank, topic, topic_times]
sql_insert = "INSERT INTO sports_news(rank,topic_name,heat) VALUES(%s,%s,%s) "
try:
# 执行sql语句
cursor.execute(sql_insert, val)
# 提交到数据库执行
conn.commit()
except:
# 如果发生错误则回滚
conn.rollback()
# SQL 查询语句
sql_select = "SELECT * FROM sports_news"
try:
# 执行SQL语句
cursor.execute(sql_select)
# 获取所有记录列表
results = cursor.fetchall()
for row in results:
rank = row[0]
topic_name = row[1]
heat = row[2]
# 打印结果
print ("排名:%d , 标题:%s , 热度:%d" % \
(rank, topic_name, heat,))
except:
print ("Error: unable to fetch data")
# 关闭连接
cursor.close()
conn.close()
"""查询体育新闻"""
def select_sports_news():
# 创建连接
conn = pymysql.connect("localhost", "root", "123456")
# 创建游标
cursor = conn.cursor()
cursor.execute("use News")
# SQL 查询语句
sql_select = "SELECT * FROM sports_news"
try:
# 执行SQL语句
cursor.execute(sql_select)
# 获取所有记录列表
results = cursor.fetchall()
for row in results:
rank = row[0]
topic_name = row[1]
heat = row[2]
# 打印结果
print("排名:%d , 标题:%s , 热度:%d" % \
(rank, topic_name, heat,))
except:
print("Error: unable to fetch data")
# 关闭连接
cursor.close()
conn.close()
"""获取网易新闻"""
def get_news(html):
soup = BeautifulSoup(html, 'html.parser')
all_topics = soup.find_all('tr')
# 创建连接
conn = pymysql.connect("localhost", "root", "123456")
# 创建游标
cursor = conn.cursor()
# 创建数据库的sql(如果数据库存在就不创建,防止异常)
# sql_data = "CREATE DATABASE IF NOT EXISTS News"
# 执行创建数据库的sql
# cursor.execute(sql_data)
cursor.execute("use News")
# cursor.execute("DROP TABLE IF EXISTS news_163")
# 使用预处理语句创建表
sql_create = """CREATE TABLE news_163 (
id INT AUTO_INCREMENT PRIMARY KEY,
titles_name CHAR(40),
topic_times CHAR(20))"""
# cursor.execute(sql_create)
for each_topic in all_topics:
titles = each_topic.find('td', class_="rank") # 标题
times = each_topic.find('td', class_="cBlue") # 点击量
if times != None and titles != None:
titles = each_topic.find('td', class_="rank").get_text()
times = each_topic.find('td', class_="cBlue").get_text()
# SQL 插入语句
val = [titles, times]
sql_insert = "INSERT INTO news_163(titles_name ,topic_times) VALUES(%s,%s) "
try:
# 执行sql语句
cursor.execute(sql_insert, val)
# 提交到数据库执行
conn.commit()
except:
# 如果发生错误则回滚
conn.rollback()
# SQL 查询语句
sql_select = "SELECT * FROM news_163"
try:
# 执行SQL语句
cursor.execute(sql_select)
# 获取所有记录列表
results = cursor.fetchall()
# print(results)
for row in results:
id = row[0]
print("排名:", int(id) % 90, end="")
titles_name = row[1]
print(" 标题:", titles_name, end="")
topic_times = row[2]
print(" 点击量:", topic_times)
except:
print ("Error: unable to fetch data")
# 关闭连接
conn.close()
"""查询网易新闻"""
def select_news():
# 创建连接
conn = pymysql.connect("localhost", "root", "123456")
# 创建游标
cursor = conn.cursor()
cursor.execute("use News")
# SQL 查询语句
sql_select = "SELECT * FROM news_163"
try:
# 执行SQL语句
cursor.execute(sql_select)
# 获取所有记录列表
results = cursor.fetchall()
# print(results)
for row in results:
id = row[0]
print("排名:", int(id) % 90, end="")
titles_name = row[1]
print(" 标题:", titles_name, end="")
topic_times = row[2]
print(" 点击量:", topic_times)
except:
print("Error: unable to fetch data")
# 关闭连接
conn.close()
""""获取腾讯国际新闻"""
def get_inter_news(html):
soup = BeautifulSoup(html, 'html.parser')
all_topics = soup.find('ul', class_="list")
# 创建连接
conn = pymysql.connect("localhost", "root", "123456")
# 创建游标
cursor = conn.cursor()
# 创建数据库的sql(如果数据库存在就不创建,防止异常)
# sql_data = "CREATE DATABASE IF NOT EXISTS News"
# 执行创建数据库的sql
# cursor.execute(sql_data)
cursor.execute("use News")
# cursor.execute("DROP TABLE IF EXISTS world_news")
# 使用预处理语句创建表
sql_create = """CREATE TABLE world_news (
id INT AUTO_INCREMENT PRIMARY KEY,
titles_name CHAR(40),
article_source CHAR(20),
topic_times INT )"""
# cursor.execute(sql_create)
for each_topic in all_topics:
titles = each_topic.find('div', class_="lazyload-placeholder") # 标题
source = each_topic.find('a', class_="tag")
times = each_topic.find('a', class_="cmt") # 点击量
if times != None and titles != None and source!= None:
titles = each_topic.find('div', class_="lazyload-placeholder").get_text()
source = each_topic.find('a', class_="tag").get_text()
times = each_topic.find('a', class_="cmt").get_text()
print(titles, source, times)
# SQL 插入语句
val = [titles, source, times]
sql_insert = "INSERT INTO world_news(titles_name , article_source, topic_times) " \
"VALUES(%s,%s,%s) "
try:
# 执行sql语句
cursor.execute(sql_insert, val)
# 提交到数据库执行
conn.commit()
except:
# 如果发生错误则回滚
conn.rollback()
# SQL 查询语句
sql_select = "SELECT * FROM world_news"
try:
# 执行SQL语句
cursor.execute(sql_select)
# 获取所有记录列表
results = cursor.fetchall()
# print(results)
for row in results:
id = row[0]
print("排名:", id, end="")
titles_name = row[1]
print(" 标题:", titles_name, end="")
article_source = row[2]
print(" 来源:", article_source, end="")
topic_times = row[3]
print(" 点击量:", topic_times)
except:
print ("Error: unable to fetch data")
# 关闭连接
conn.close()
"""查询腾讯国际新闻"""
def select_inter_news():
# 创建连接
conn = pymysql.connect("localhost", "root", "123456")
# 创建游标
cursor = conn.cursor()
cursor.execute("use News")
# SQL 查询语句
sql_select = "SELECT * FROM world_news"
try:
# 执行SQL语句
cursor.execute(sql_select)
# 获取所有记录列表
results = cursor.fetchall()
for row in results:
id = row[0]
print("排名:", id, end="")
titles_name = row[1]
print(" 标题:", titles_name, end="")
article_source = row[2]
print(" 来源:", article_source, end="")
topic_times = row[3]
print(" 点击量:", topic_times)
except:
print("Error: unable to fetch data")
# 关闭连接
conn.close()
def main():
    """Console entry point: choose a site and category, then crawl, store
    and display the matching hot-news list."""
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
                      'AppleWebKit/537.36 (KHTML, like Gecko) '
                      'Chrome/80.0.3987.163 Safari/537.36'
    }
    # strip() so a stray space cannot break the menu matching.
    website = input("请输入你要获取新闻的网站(1、百度 2、网易 3、腾讯 4、历史查询)请输入序号:").strip()
    if website == '1':
        # Baidu hot-search module.
        print("<--百度新闻热点模块-->")
        news_type = input("请输入你要查询的新闻类型(1、实时 2、娱乐 3、体育)输入序号:").strip()
        # Fixed: the original compared against '1 ' / '2 ' / '3 ' (trailing
        # space), so these branches could never match normal input.
        if news_type == '1':
            url = 'http://top.baidu.com/buzz?b=1&fr=20811'  # realtime news
            get_realtime_news(get_html(url, headers))
            select_realtime_news()
        elif news_type == '2':
            url = 'http://top.baidu.com/buzz?b=344&c=513&fr=topcategory_c513'  # entertainment
            # Fixed: this branch used to call get_realtime_news, writing
            # entertainment rows into the real_time table.
            get_pastime_news(get_html(url, headers))
        elif news_type == '3':
            url = 'http://top.baidu.com/buzz?b=11&c=513&fr=topbuzz_b1'  # sports
            get_sports_news(get_html(url, headers))
    elif website == '2':
        # NetEase hot-news module.
        print("<--网易新闻热点模块-->")
        news_type = input("请输入你要查询的新闻类型(1、教育 2、实时)输入序号:").strip()
        if news_type == '1':
            url = 'http://news.163.com/special/0001386F/rank_edu.html'  # education
            get_edu_news(get_html(url, headers))
        elif news_type == '2':
            url = 'http://news.163.com/special/0001386F/rank_news.html'  # realtime
            get_news(get_html(url, headers))
    elif website == '3':
        # Tencent hot-news module (world/international channel).
        print("<--腾讯新闻热点模块-->")
        url = 'https://new.qq.com/ch/world/'
        get_inter_news(get_html(url, headers))
    elif website == '4':
        # History lookup: read back previously stored tables.
        print("<--历史查询模块-->")
        print("1、百度实时 2、娱乐 3、体育 4、教育 5、网易实时 6、腾讯国际")
        history_news = input("请输入你要查询的类型序号:").strip()
        selectors = {
            '1': select_realtime_news,
            '2': select_pastime_news,
            '3': select_sports_news,
            '4': select_edu_news,
            '5': select_news,
            '6': select_inter_news,
        }
        action = selectors.get(history_news)
        if action is not None:
            action()
    else:
        # Fallback for any unrecognized top-level choice (with the flat `if`
        # chain in the original, this `else` bound to the last history check
        # and fired on almost every history query).
        print("更多模块有待开发!敬请期待!")


if __name__ == '__main__':
    main()
# (removed: blog-page scrape residue — stray dates/view counts that were not
# Python code and made the file a syntax error)