# 一.爬取三种新闻网站上某些分类下的热点新闻的相关数据利用MySQL存储。
# 1)编写Python程序,使用户通过在控制台输入相关条件,进而查询到想了解的信息。
import requests
from bs4 import BeautifulSoup
import pymysql
"""获取HTML"""
def get_html(url, headers):
r = requests.get(url, headers=headers)
r.encoding = r.apparent_encoding
return r.text
"""获取实时新闻"""
def get_realtime_news(html):
soup = BeautifulSoup(html, 'html.parser')
all_topics = soup.find_all('tr')[1:]
# 创建连接
conn = pymysql.connect("localhost", "root", "123456")
# 创建游标
cursor = conn.cursor()
# 创建数据库的sql(如果数据库存在就不创建,防止异常)
# sql_data = "CREATE DATABASE IF NOT EXISTS News"
# 执行创建数据库的sql
# cursor.execute(sql_data)
cursor.execute("use News")
# cursor.execute("DROP TABLE IF EXISTS real_time")
# 使用预处理语句创建表
sql_create = """CREATE TABLE real_time (
rank INT NOT NULL,
topic_name CHAR(40),
heat INT,
PRIMARY KEY (`rank`))"""
# cursor.execute(sql_create)
for each_topic in all_topics:
topic_times = each_topic.find('td', class_='last') # 点击量
topic_rank = each_topic.find('td', class_='first') # 排名
topic = each_topic.find('td', class_='keyword') # 标题目
if topic_rank != None and topic != None and topic_times != None:
topic_rank = each_topic.find('td', class_='first').get_text().replace(' ', '').replace('\n', '')
topic = each_topic.find('td', class_='keyword').get_text().replace(' ', '').replace('\n', '')
topic_times = each_topic.find('td', class_='last').get_text().replace(' ', '').replace('\n', '')
# SQL 插入语句
val = [topic_rank, topic, topic_times]
sql_insert = "INSERT INTO real_time(rank,topic_name,heat) VALUES(%s,%s,%s) "
try:
# 执行sql语句
cursor.execute(sql_insert, val)
# 提交到数据库执行
conn.commit()
except:
# 如果发生错误则回滚
conn.rollback()
# 关闭连接
cursor.close()
conn.close()
"""查询实时新闻"""
def select_realtime_news():
# 创建连接
conn = pymysql.connect("localhost", "root", "123456")
# 创建游标
cursor = conn.cursor()
cursor.execute("use News")
# SQL 查询语句
sql_select = "SELECT * FROM real_time"
try:
# 执行SQL语句
cursor.execute(sql_select)
# 获取所有记录列表
results = cursor.fetchall()
for row in results:
rank = row[0]
topic_name = row[1]
heat = row[2]
# 打印结果
print("排名:%d ,标题:%s ,热度:%d" % \
(rank, topic_name, heat,))
except:
print("Error: unable to fetch data")
# 关闭连接
cursor.close()
conn.close()
"""获取娱乐新闻"""
def get_pastime_news(html):
soup = BeautifulSoup(html, 'html.parser')
all_topics = soup.find_all('tr')[1:]
# 创建连接
conn = pymysql.connect("localhost", "root", "123456")
# 创建游标
cursor = conn.cursor()
# 创建数据库的sql(如果数据库存在就不创建,防止异常)
# sql_data = "CREATE DATABASE IF NOT EXISTS News"
# 执行创建数据库的sql
# cursor.execute(sql_data)
cursor.execute("use News")
cursor.execute("DROP TABLE IF EXISTS pastime_news")
# 使用预处理语句创建表
sql_create = """CREATE TABLE pastime_news (
rank INT NOT NULL,
topic_name CHAR(40),
heat INT,
PRIMARY KEY (`rank`))"""
cursor.execute(sql_create)
for each_topic in all_topics:
topic_times = each_topic.find('td', class_='last') # 搜索指数
topic_rank = each_topic.find('td', class_='first') # 排名
topic = each_topic.find('td', class_='keyword') # 标题目
if topic_rank != None and topic != None and topic_times != None:
topic_rank = each_topic.find('td', class_='first').get_text().replace(' ', '').replace('\n', '')
topic = each_topic.find('td', class_='keyword').get_text().replace(' ', '').replace('\n', '')
topic_times = each_topic.find('td', class_='last').get_text().replace(' ', '').replace('\n', '')
# SQL 插入语句
val = [topic_rank, topic, topic_times]
sql_insert = "INSERT INTO pastime_news(rank,topic_name,heat) VALUES(%s,%s,%s) "
try:
# 执行sql语句
cursor.execute(sql_insert, val)
# 提交到数据库执行
conn.commit()
except:
# 如果发生错误则回滚
conn.rollback()
# SQL 查询语句
sql_select = "SELECT * FROM pastime_news"
try:
# 执行SQL语句
cursor.execute(sql_select)
# 获取所有记录列表
results = cursor.fetchall()
for row in results:
rank = row[0]
topic_name = row[1]
heat = row[2]
# 打印结果
print ("排名:%d ,标题:%s ,热度:%d" % \
(rank, topic_name, heat))
except:
print ("Error: unable to fetch data")
# 关闭连接
cursor.close()
conn.close()
"""查询娱乐新闻"""
def select_pastime_news():
# 创建连接
conn = pymysql.connect("localhost", "root", "123456")
# 创建游标
cursor = conn.cursor()
cursor.execute("use News")
# SQL 查询语句
sql_select = "SELECT * FROM pastime_news"
try:
# 执行SQL语句
cursor.execute(sql_select)
# 获取所有记录列表
results = cursor.fetchall()
for row in results:
rank = row[0]
topic_name = row[1]
heat = row[2]
# 打印结果
print("排名:%d ,标题:%s ,热度:%d" % \
(rank, topic_name, heat))
except:
print("Error: unable to fetch data")
# 关闭连接
cursor.close()
conn.close()
"""获取教育新闻"""
def get_edu_news(html):
soup = BeautifulSoup(html, 'html.parser')
all_topics = soup.find_all('tr')
# 创建连接
conn = pymysql.connect("localhost", "root", "123456")
# 创建游标
cursor = conn.cursor()
# 创建数据库的sql(如果数据库存在就不创建,防止异常)
# sql_data = "CREATE DATABASE IF NOT EXISTS News"
# 执行创建数据库的sql
# cursor.execute(sql_data)
cursor.execute("use News")
# cursor.execute("DROP TABLE IF EXISTS edu_news")
# 使用预处理语句创建表
# sql_create = """CREATE TABLE edu_news (
# id INT AUTO_INCREMENT PRIMARY KEY,
# titles_name CHAR(40),
# topic_times CHAR(20))"""
# cursor.execute(sql_create)
# print(all_topics)
for each_topic in all_topics:
titles = each_topic.find('td', class_="rank") # 标题
times = each_topic.find('td', class_="cBlue") # 点击量
if times != None and titles != None:
titles = each_topic.find('td', class_="rank").get_text()
times = each_topic.find('td', class_="cBlue").get_text()
# # SQL 插入语句
val = [titles, times]
sql_insert = "INSERT INTO edu_news(titles_name ,topic_times) VALUES(%s,%s) "
try:
# 执行sql语句
# cursor.execute(sql_insert, val)
# 提交到数据库执行
conn.commit()
except:
# 如果发生错误则回滚
conn.rollback()
# SQL 查询语句
sql_select = "SELECT * FROM edu_news"
try:
# 执行SQL语句
cursor.execute(sql_select)
# 获取所有记录列表
results = cursor.fetchall()
# print(results)
for row in results:
id = row[0]
print("排名:", int(id) % 90, end="")
titles_name = row[1]
print(" 标题:", titles_name, end="")
topic_times = row[2]
print(" 点击量:", topic_times)
except:
print ("Error: unable to fetch data")
# 关闭连接
conn.close()
"""查询教育新闻"""
def select_edu_news():
# 创建连接
conn = pymysql.connect("localhost", "root", "123456")
# 创建游标
cursor = conn.cursor()
cursor.execute("use News")
# SQL 查询语句
sql_select = "SELECT * FROM edu_news"
try:
# 执行SQL语句
cursor.execute(sql_select)
# 获取所有记录列表
results = cursor.fetchall()
# print(results)
for row in results:
id = row[0]
print("排名:", int(id) % 90, end="")
titles_name = row[1]
print(" 标题:", titles_name, end="")
topic_times = row[2]
print(" 点击量:", topic_times)
except:
print("Error: unable to fetch data")
# 关闭连接
conn.close()
"""获取体育新闻"""
def get_sports_news(html):
soup = BeautifulSoup(html, 'html.parser')
all_topics = soup.find_all('tr')[1:]
for each_topic in all_topics:
topic_times = each_topic.find('td', class_='last') # 点击量
topic_rank = each_topic.find('td', class_='first') # 排名
topic = each_topic.find('td', class_='keyword') # 标题目
# 创建连接
conn = pymysql.connect("localhost", "root", "123456")
# 创建游标
cursor = conn.cursor()
# 创建数据库的sql(如果数据库存在就不创建,防止异常)
# sql_data = "CREATE DATABASE IF NOT EXISTS News"
# 执行创建数据库的sql
# cursor.execute(sql_data)
cursor.execute("use News")
# cursor.execute("DROP TABLE IF EXISTS sports_news")
# 使用预处理语句创建表
sql_create = """CREATE TABLE sports_news (
rank INT NOT NULL,
topic_name CHAR(40),
heat INT,
PRIMARY KEY (`rank`))"""
# cursor.execute(sql_create)
if topic_rank != None and topic != None and topic_times != None:
topic_rank = each_topic.find('td', class_='first').get_text().replace(' ', '').replace('\n', '')
topic = each_topic.find('td', class_='keyword').get_text().replace(' ', '').replace('\n', '')
topic_times = each_topic.find('td', class_='last').get_text().replace(' ', '').replace('\n', '')
print(topic_rank,topic,topic_times)
# SQL 插入语句
val = [topic_rank, topic, topic_times]
sql_insert = "INSERT INTO sports_news(rank,topic_name,heat) VALUES(%s,%s,%s) "
try:
# 执行sql语句
cursor.execute(sql_insert, val)
# 提交到数据库执行
conn.commit()
except:
# 如果发生错误则回滚
conn.rollback()
# SQL 查询语句
sql_select = "SELECT * FROM sports_news"
try:
# 执行SQL语句
cursor.execute(sql_select)
# 获取所有记录列表
results = cursor.fetchall()
for row in results:
rank = row[0]
topic_name = row[1]
heat = row[2]
# 打印结果
print ("排名:%d , 标题:%s , 热度:%d" % \
(rank, topic_name, heat,))
except:
print ("Error: unable to fetch data")
# 关闭连接
cursor.close()
conn.close()
"""查询体育新闻"""
def select_sports_news():
# 创建连接
conn = pymysql.connect("localhost", "root", "123456")
# 创建游标
cursor = conn.cursor()
cursor.execute("use News")
# SQL 查询语句
sql_select = "SELECT * FROM sports_news"
try:
# 执行SQL语句
cursor.execute(sql_select)
# 获取所有记录列表
results = cursor.fetchall()
for row in results:
rank = row[0]
topic_name = row[1]
heat = row[2]
# 打印结果
print("排名:%d , 标题:%s , 热度:%d" % \
(rank, topic_name, heat,))
except:
print("Error: unable to fetch data")
# 关闭连接
cursor.close()
conn.close()
"""获取网易新闻"""
def get_news(html):
soup = BeautifulSoup(html, 'html.parser')
all_topics = soup.find_all('tr')
# 创建连接
conn = pymysql.connect("localhost", "root", "123456")
# 创建游标
cursor = conn.cursor()
# 创建数据库的sql(如果数据库存在就不创建,防止异常)
# sql_data = "CREATE DATABASE IF NOT EXISTS News"
# 执行创建数据库的sql
# cursor.execute(sql_data)
cursor.execute("use News")
# cursor.execute("DROP TABLE IF EXISTS news_163")
# 使用预处理语句创建表
sql_create = """CREATE TABLE news_163 (
id INT AUTO_INCREMENT PRIMARY KEY,
titles_name CHAR(40),
topic_times CHAR(20))"""
# cursor.execute(sql_create)
for each_topic in all_topics:
titles = each_topic.find('td', class_="rank") # 标题
times = each_topic.find('td', class_="cBlue") # 点击量
if times != None and titles != None:
titles = each_topic.find('td', class_="rank").get_text()
times = each_topic.find('td', class_="cBlue").get_text()
# SQL 插入语句
val = [titles, times]
sql_insert = "INSERT INTO news_163(titles_name ,topic_times) VALUES(%s,%s) "
try:
# 执行sql语句
cursor.execute(sql_insert, val)
# 提交到数据库执行
conn.commit()
except:
# 如果发生错误则回滚
conn.rollback()
# SQL 查询语句
sql_select = "SELECT * FROM news_163"
try:
# 执行SQL语句
cursor.execute(sql_select)
# 获取所有记录列表
results = cursor.fetchall()
# print(results)
for row in results:
id = row[0]
print("排名:", int(id) % 90, end="")
titles_name = row[1]
print(" 标题:", titles_name, end="")
topic_times = row[2]
print(" 点击量:", topic_times)
except:
print ("Error: unable to fetch data")
# 关闭连接
conn.close()
"""查询网易新闻"""
def select_news():
# 创建连接
conn = pymysql.connect("localhost", "root", "123456")
# 创建游标
cursor = conn.cursor()
cursor.execute("use News")
# SQL 查询语句
sql_select = "SELECT * FROM news_163"
try:
# 执行SQL语句
cursor.execute(sql_select)
# 获取所有记录列表
results = cursor.fetchall()
# print(results)
for row in results:
id = row[0]
print("排名:", int(id) % 90, end="")
titles_name = row[1]
print(" 标题:", titles_name, end="")
topic_times = row[2]
print(" 点击量:", topic_times)
except:
print("Error: unable to fetch data")
# 关闭连接
conn.close()
""""获取腾讯国际新闻"""
def get_inter_news(html):
soup = BeautifulSoup(html, 'html.parser')
all_topics = soup.find('ul', class_="list")
# 创建连接
conn = pymysql.connect("localhost", "root", "123456")
# 创建游标
cursor = conn.cursor()
# 创建数据库的sql(如果数据库存在就不创建,防止异常)
# sql_data = "CREATE DATABASE IF NOT EXISTS News"
# 执行创建数据库的sql
# cursor.execute(sql_data)
cursor.execute("use News")
# cursor.execute("DROP TABLE IF EXISTS world_news")
# 使用预处理语句创建表
sql_create = """CREATE TABLE world_news (
id INT AUTO_INCREMENT PRIMARY KEY,
titles_name CHAR(40),
article_source CHAR(20),
topic_times INT )"""
# cursor.execute(sql_create)
for each_topic in all_topics:
titles = each_topic.find('div', class_="lazyload-placeholder") # 标题
source = each_topic.find('a', class_="tag")
times = each_topic.find('a', class_="cmt") # 点击量
if times != None and titles != None and source!= None:
titles = each_topic.find('div', class_="lazyload-placeholder").get_text()
source = each_topic.find('a', class_="tag").get_text()
times = each_topic.find('a', class_="cmt").get_text()
print(titles, source, times)
# SQL 插入语句
val = [titles, source, times]
sql_insert = "INSERT INTO world_news(titles_name , article_source, topic_times) " \
"VALUES(%s,%s,%s) "
try:
# 执行sql语句
cursor.execute(sql_insert, val)
# 提交到数据库执行
conn.commit()
except:
# 如果发生错误则回滚
conn.rollback()
# SQL 查询语句
sql_select = "SELECT * FROM world_news"
try:
# 执行SQL语句
cursor.execute(sql_select)
# 获取所有记录列表
results = cursor.fetchall()
# print(results)
for row in results:
id = row[0]
print("排名:", id, end="")
titles_name = row[1]
print(" 标题:", titles_name, end="")
article_source = row[2]
print(" 来源:", article_source, end="")
topic_times = row[3]
print(" 点击量:", topic_times)
except:
print ("Error: unable to fetch data")
# 关闭连接
conn.close()
"""查询腾讯国际新闻"""
def select_inter_news():
# 创建连接
conn = pymysql.connect("localhost", "root", "123456")
# 创建游标
cursor = conn.cursor()
cursor.execute("use News")
# SQL 查询语句
sql_select = "SELECT * FROM world_news"
try:
# 执行SQL语句
cursor.execute(sql_select)
# 获取所有记录列表
results = cursor.fetchall()
for row in results:
id = row[0]
print("排名:", id, end="")
titles_name = row[1]
print(" 标题:", titles_name, end="")
article_source = row[2]
print(" 来源:", article_source, end="")
topic_times = row[3]
print(" 点击量:", topic_times)
except:
print("Error: unable to fetch data")
# 关闭连接
conn.close()
def main():
    """Console entry point: choose a site and category, then crawl, store
    and display the matching hot-news list."""
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
                      'AppleWebKit/537.36 (KHTML, like Gecko) '
                      'Chrome/80.0.3987.163 Safari/537.36'
    }
    # strip() so a stray space cannot break the menu matching.
    website = input("请输入你要获取新闻的网站(1、百度 2、网易 3、腾讯 4、历史查询)请输入序号:").strip()
    if website == '1':
        # Baidu hot-search module.
        print("<--百度新闻热点模块-->")
        news_type = input("请输入你要查询的新闻类型(1、实时 2、娱乐 3、体育)输入序号:").strip()
        # Fixed: the original compared against '1 ' / '2 ' / '3 ' (trailing
        # space), so these branches could never match normal input.
        if news_type == '1':
            url = 'http://top.baidu.com/buzz?b=1&fr=20811'  # realtime news
            get_realtime_news(get_html(url, headers))
            select_realtime_news()
        elif news_type == '2':
            url = 'http://top.baidu.com/buzz?b=344&c=513&fr=topcategory_c513'  # entertainment
            # Fixed: this branch used to call get_realtime_news, writing
            # entertainment rows into the real_time table.
            get_pastime_news(get_html(url, headers))
        elif news_type == '3':
            url = 'http://top.baidu.com/buzz?b=11&c=513&fr=topbuzz_b1'  # sports
            get_sports_news(get_html(url, headers))
    elif website == '2':
        # NetEase hot-news module.
        print("<--网易新闻热点模块-->")
        news_type = input("请输入你要查询的新闻类型(1、教育 2、实时)输入序号:").strip()
        if news_type == '1':
            url = 'http://news.163.com/special/0001386F/rank_edu.html'  # education
            get_edu_news(get_html(url, headers))
        elif news_type == '2':
            url = 'http://news.163.com/special/0001386F/rank_news.html'  # realtime
            get_news(get_html(url, headers))
    elif website == '3':
        # Tencent hot-news module (world/international channel).
        print("<--腾讯新闻热点模块-->")
        url = 'https://new.qq.com/ch/world/'
        get_inter_news(get_html(url, headers))
    elif website == '4':
        # History lookup: read back previously stored tables.
        print("<--历史查询模块-->")
        print("1、百度实时 2、娱乐 3、体育 4、教育 5、网易实时 6、腾讯国际")
        history_news = input("请输入你要查询的类型序号:").strip()
        selectors = {
            '1': select_realtime_news,
            '2': select_pastime_news,
            '3': select_sports_news,
            '4': select_edu_news,
            '5': select_news,
            '6': select_inter_news,
        }
        action = selectors.get(history_news)
        if action is not None:
            action()
    else:
        # Fallback for any unrecognized top-level choice (with the flat `if`
        # chain in the original, this `else` bound to the last history check
        # and fired on almost every history query).
        print("更多模块有待开发!敬请期待!")


if __name__ == '__main__':
    main()
# (removed: blog-page scrape residue — stray dates/view counts that were not
# Python code and made the file a syntax error)