上一篇介绍了静态网页的爬虫,这次我们试试动态网页怎么爬取。
# Location of the ChromeDriver binary handed to selenium later on.
chrm = R"F:\Python\chromedriver_win32\chromedriver.exe"

# Scraped abstracts / URLs are persisted in a local SQLite database.
import sqlite3

db = R"E:\TencentNews.db"

# Create both tables up front: News holds per-headline metadata, NewsBody
# holds the full article text keyed by the same id.
with sqlite3.connect(db) as conn:
    cursor = conn.cursor()
    cursor.execute(
        "Create Table If Not Exists News( "
        "id INTEGER PRIMARY KEY NOT NULL, "
        "url varchar(100), "
        "title varchar(100), "
        "theme varchar(100), "
        "date date, "
        "time time)")
    cursor.execute(
        "Create Table If Not Exists NewsBody( "
        "id INTEGER PRIMARY KEY NOT NULL, "
        "text text, "
        "FOREIGN KEY(id) REFERENCES News(id))")
    conn.commit()
###--------------------------------------
import time
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from bs4 import BeautifulSoup
from selenium.common.exceptions import NoSuchElementException
import pandas as pd
import datetime
import sqlite3
import requests
def start_crawler(start_date, end_date):
    """Crawl the Tencent rolling-news index between *start_date* and
    *end_date* (inclusive) and store each day's headline metadata.

    For every day: flip the in-page calendar to that day, then page
    through the headline list until no "next page" link remains.
    """
    # If chromedriver is on the system PATH, Chrome() works without args.
    browser = webdriver.Chrome(chrm)
    browser.get("http://roll.news.qq.com/")
    for day in pd.date_range(start_date, end_date):
        # Switch the page's calendar widget to the target day.
        browser = to_date(browser, day)
        print("抓取{}天新闻".format(day.day))
        seen_urls = get_exist(day)
        more_pages = True
        while more_pages:
            # Scrape the headlines on the current page and persist them.
            to_db(get_page_news(browser, day, seen_urls))
            try:
                next_link = browser.find_element_by_xpath(
                    '//div[@id="pageArea"]//a[text()="下一页>"]')
            except NoSuchElementException:
                print("当前天{}抓取完毕".format(day))
                more_pages = False
            else:
                next_link.click()
                print("进行下一页的抓取")
                time.sleep(1)  # throttle: give the server time to respond
    print("正常抓取完毕,关闭浏览器!")
    browser.quit()
def to_date(driver, date):
    """Flip the page's calendar widget to *date* and return the driver.

    Reads the year/month currently shown in the calendar header, clicks
    the prev/next year and month buttons the required number of times,
    then clicks the target day cell.
    """
    # Read the year/month currently displayed by the calendar header.
    soup = BeautifulSoup(driver.page_source, "lxml")
    ym_s = soup.find("td", id="_CalendarYear_").h3.string
    ym = datetime.datetime.strptime(ym_s, "%Y年%m月")
    p_year, p_month = ym.year, ym.month  # year and month currently on the page
    # Distance between the target date and what the page shows.
    diff_year = date.year - p_year
    diff_month = date.month - p_month
    # Locate the four year/month navigation buttons.
    last_year = driver.find_element_by_xpath('//td[@title="上一年"]')
    last_month = driver.find_element_by_xpath('//td[@title="上一月"]')
    next_month = driver.find_element_by_xpath('//td[@title="下一月"]')
    next_year = driver.find_element_by_xpath('//td[@title="下一年"]')
    # Adjust the year first (forward when diff >= 0, backward otherwise).
    if diff_year >= 0:
        for i in range(diff_year):
            next_year.click()
            time.sleep(0.1)
    else:
        for i in range(-diff_year):
            last_year.click()
            time.sleep(0.1)
    # Then the month.
    if diff_month >= 0:
        for i in range(diff_month):
            next_month.click()
            time.sleep(0.1)
    else:
        for i in range(-diff_month):
            last_month.click()
            time.sleep(0.1)
    # Finally click the day cell itself.
    driver.find_element_by_xpath(
        '//tbody//a[text()={}]'.format(date.day)).click()
    time.sleep(1)
    return driver
def get_exist(date):
    """Return the set of article URLs already stored for *date*.

    Used to skip headlines saved by a previous (possibly interrupted)
    crawl of the same day.

    :param date: a datetime-like object with strftime support.
    :return: set of url strings (possibly empty).
    """
    db = 'E:\\TencentNews.db'
    # Bind the date as a parameter instead of interpolating it into the
    # SQL text: safer and lets sqlite reuse the prepared statement.
    sql = "SELECT url FROM News WHERE `date` = ?"
    day = datetime.datetime.strftime(date, "%Y-%m-%d")
    with sqlite3.connect(db) as conn:
        rows = conn.execute(sql, (day,)).fetchall()
    # Flatten the 1-tuples into a set for O(1) membership tests.
    return {url for (url,) in rows}
def get_page_news(driver, date, exist_url):
    """Parse the headline list on the current page into DB-ready tuples.

    Returns a list of (id, url, title, theme, date, time) tuples, one per
    <li> in the page's "list c" div, skipping URLs already in *exist_url*.
    The leading None lets sqlite autoassign the primary key.
    """
    soup = BeautifulSoup(driver.page_source, "lxml")
    day = datetime.datetime.strftime(date, "%Y-%m-%d")
    rows = []
    for item in soup.find("div", class_="list c")("li"):
        link = item.a["href"]
        if link not in exist_url:
            rows.append((
                None,                                            # id: autoincrement
                link,
                item.a.string,                                   # headline text
                item.find("span", class_="t-tit").string[1:-1],  # theme, brackets stripped
                day,
                item.find("span", class_="t-time").string.split()[1],  # HH:MM part
            ))
    return rows
def to_db(data):
    """Bulk-insert one page of (id, url, title, theme, date, time) rows
    into the News table."""
    insert_sql = (
        "INSERT INTO News "
        "(id,url,title,theme,date,time) "
        "VALUES (?, ?, ?, ?, ?, ?)")
    with sqlite3.connect('E:\\TencentNews.db') as conn:
        conn.cursor().executemany(insert_sql, data)
        conn.commit()
# Launch the first stage: crawl headline metadata for the date range.
start_crawler("2017-06-01", "2017-06-05")
# Spot-check the crawl results with pandas + SQLAlchemy.
import sqlalchemy
import pandas as pd
# NOTE(review): the `encoding` argument to create_engine is deprecated and
# removed in newer SQLAlchemy releases — confirm the installed version.
sqlite_engine = sqlalchemy.create_engine('sqlite:///E:/TencentNews.db',
                                         encoding='utf-8')
df = pd.read_sql("SELECT * FROM News limit 1", sqlite_engine)
df.date.values
抓取1天新闻
进行下一页的抓取
…
当前天2017-06-05 00:00:00抓取完毕
正常抓取完毕,关闭浏览器!
##
# 第二部分, 爬取数据库中的链接的具体新闻
def get_news_body(start_date, end_date):
    """Second stage: download the full article text for every stored link
    in [start_date, end_date] whose body is not yet in the database."""
    for day in pd.date_range(start_date, end_date):
        # Links for this day that still lack a NewsBody row.
        pending = get_news_linksfrom_database(day)
        print("INFO: Crawling Date {} News Body Now...".format(day))
        for link_id, link_url in pending:
            body = get_news_text(link_url)
            writer_news_body_to_database(link_id, body)
    print("抓取时间段:{}到{}新闻主体完毕!".format(start_date, end_date))
def get_news_text(url):
    """Fetch *url* and return the article body text, or None when the page
    has no recognizable article container (video pages, removed news, ...).

    :param url: article URL stored by the first-stage crawler.
    """
    html = requests.get(url)
    # Let requests sniff the real encoding from the payload; qq.com pages
    # often use gb2312/gbk regardless of the response headers.
    html.encoding = html.apparent_encoding
    soup = BeautifulSoup(html.text, 'html.parser')
    node = soup.find("div", {"id": "Cnt-Main-Article-QQ"})
    # The original bare `except:` swallowed *every* error (including
    # KeyboardInterrupt); only the missing-container case maps to None.
    return node.text if node is not None else None
def writer_news_body_to_database(linkid, news_body):
    """Persist the article text for news id *linkid* into NewsBody."""
    print("INFO: Writing News ID:{} To Database...".format(linkid))
    db = 'E:\\TencentNews.db'
    insert_sql = (
        "INSERT INTO newsbody (id, text) "
        "VALUES (?, ?)")
    with sqlite3.connect(db) as conn:
        conn.cursor().execute(insert_sql, (linkid, news_body))
        conn.commit()
def get_news_linksfrom_database(date):
    """Return [(id, url), ...] for news of *date* whose body has not been
    crawled yet (i.e. no matching row in NewsBody).

    :param date: a datetime-like object with strftime support.
    :return: list of (id, url) tuples; empty list when nothing is pending.
    """
    db = 'E:\\TencentNews.db'
    # LEFT JOIN + IS NULL keeps only ids missing from newsbody; the date is
    # bound as a parameter instead of being interpolated into the SQL text.
    sql = (
        "SELECT news.id, news.url "
        "FROM news LEFT JOIN newsbody "
        "ON news.id = newsbody.id WHERE "
        "news.date = ? AND newsbody.id "
        "IS NULL;")
    day = datetime.datetime.strftime(date, '%Y-%m-%d')
    with sqlite3.connect(db) as conn:
        # fetchall() already yields [] when nothing matches, so no
        # `result if result else []` fallback is needed.
        return conn.cursor().execute(sql, (day,)).fetchall()
# Launch the second stage over the first two crawled days.
get_news_body("2017-06-01", "2017-06-02")
INFO: Crawling Date 2017-06-01 00:00:00 News Body Now…
INFO: Writing News ID:1 To Database…
INFO: Writing News ID:2 To Database…
…
INFO: Crawling Date 2017-06-02 00:00:00 News Body Now…
抓取时间段:2017-06-01到2017-06-02新闻主体完毕!