Python Web Scraping in Practice (Dynamic Pages)

The previous post covered scraping static pages; this time, let's look at how to scrape a dynamic page.

# Path to the ChromeDriver executable
chrm = R"F:\Python\chromedriver_win32\chromedriver.exe"

# Scrape the news summaries, URLs and other metadata and write them to the database

import sqlite3
# Create the database
db = R"E:\TencentNews.db"

with sqlite3.connect(db) as conn:
    sql = (
        "Create Table If Not Exists News( "
        "id INTEGER PRIMARY KEY NOT NULL, "
        "url varchar(100), "
        "title varchar(100), "
        "theme varchar(100), "
        "date date, "
        "time time)")
    cursor = conn.cursor()
    cursor.execute(sql)
    conn.commit()

with sqlite3.connect(db) as conn:
    sql = (
        "Create Table If Not Exists NewsBody( "
        "id INTEGER PRIMARY KEY NOT NULL, "
        "text text, "
        "FOREIGN KEY(id) REFERENCES News(id))")
    cursor = conn.cursor()
    cursor.execute(sql)
    conn.commit()
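
As a quick sanity check (not part of the original script), you can list the tables that were just created:

# Optional sanity check: confirm both tables exist
with sqlite3.connect(db) as conn:
    cursor = conn.cursor()
    cursor.execute("SELECT name FROM sqlite_master WHERE type='table'")
    print(cursor.fetchall())  # expect [('News',), ('NewsBody',)]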

###--------------------------------------
import time
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from bs4 import BeautifulSoup
from selenium.common.exceptions import NoSuchElementException
import pandas as pd
import datetime
import sqlite3
import requests



def start_crawler(start_date, end_date):
    # If chromedriver is on the system PATH, the path argument can be omitted
    driver = webdriver.Chrome(chrm)
    driver.get("http://roll.news.qq.com/")
    for date in pd.date_range(start_date, end_date):
        # Switch the calendar on the page to the target date
        driver = to_date(driver, date)
        print("Crawling news for day {}".format(date.day))
        exist_url = get_exist(date)
        while True:
            # Scrape the news on the current page
            one_page = get_page_news(driver, date, exist_url)
            # Write it to the database
            to_db(one_page)

            # Jump to the next page
            try:
                driver.find_element_by_xpath(
                    '//div[@id="pageArea"]//a[text()="下一页>"]').click()
                print("Moving on to the next page")
                time.sleep(1)  # throttle, give the server time to respond
            except NoSuchElementException:
                print("Finished crawling {}".format(date))
                break
    print("Crawling finished normally, closing the browser!")
    driver.quit()
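
The fixed time.sleep(1) calls above simply give the page time to render. A more robust alternative is Selenium's explicit wait; the sketch below assumes the same "pageArea" pagination container and is not part of the original script:

# Sketch: wait for the "next page" link instead of sleeping a fixed time
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

def click_next_page(driver, timeout=10):
    # Blocks until the link is clickable (or raises TimeoutException)
    next_link = WebDriverWait(driver, timeout).until(
        EC.element_to_be_clickable(
            (By.XPATH, '//div[@id="pageArea"]//a[text()="下一页>"]')))
    next_link.click()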


def to_date(driver, date):
    # Read the year/month currently shown on the calendar
    soup = BeautifulSoup(driver.page_source, "lxml")
    ym_s = soup.find("td", id="_CalendarYear_").h3.string
    ym = datetime.datetime.strptime(ym_s, "%Y年%m月")
    p_year, p_month = ym.year, ym.month  # year and month displayed on the page
    # Year/month offset between the page and the target date
    diff_year = date.year - p_year
    diff_month = date.month - p_month
    # Locate the year/month navigation buttons
    last_year = driver.find_element_by_xpath('//td[@title="上一年"]')
    last_month = driver.find_element_by_xpath('//td[@title="上一月"]')
    next_month = driver.find_element_by_xpath('//td[@title="下一月"]')
    next_year = driver.find_element_by_xpath('//td[@title="下一年"]')
    # Adjust the year
    if diff_year >= 0:
        for i in range(diff_year):
            next_year.click()
            time.sleep(0.1)
    else:
        for i in range(-diff_year):
            last_year.click()
            time.sleep(0.1)
    # Adjust the month
    if diff_month >= 0:
        for i in range(diff_month):
            next_month.click()
            time.sleep(0.1)
    else:
        for i in range(-diff_month):
            last_month.click()
            time.sleep(0.1)
    # Select the day
    driver.find_element_by_xpath(
        '//tbody//a[text()={}]'.format(date.day)).click()
    time.sleep(1)
    return driver
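
To make the year/month arithmetic concrete, here is a small standalone example (the values are made up for illustration):

# Example: the calendar shows 2017年05月 and the target date is 2017-06-03
p_year, p_month = 2017, 5
target = datetime.datetime(2017, 6, 3)
diff_year = target.year - p_year     # 0 -> no year clicks needed
diff_month = target.month - p_month  # 1 -> click "下一月" once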


def get_exist(date):
    db = 'E:\\TencentNews.db'
    with sqlite3.connect(db) as conn:
        sql = (
            "SELECT url FROM News where "
            "`date`='{}'".format(datetime.datetime.strftime(date, "%Y-%m-%d")))

        cursor = conn.cursor()
        cursor.execute(sql)
        conn.commit()
        result = cursor.fetchall()
    return set(*zip(*result))  # unpack the URLs from result, deduplicate with a set
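
The set(*zip(*result)) trick transposes the single-column result and unpacks it into a set. An equivalent, arguably clearer form (a sketch, behaviour should be identical) is a set comprehension:

# Equivalent to set(*zip(*result)): keep the first column of each row
def rows_to_url_set(rows):
    return {row[0] for row in rows}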


def get_page_news(driver, date, exist_url):
    soup = BeautifulSoup(driver.page_source, "lxml")
    one_page = []
    for i in soup.find("div", class_="list c")("li"):
        if i.a["href"] in exist_url:
            continue
        one_page.append(
            (None,
             i.a["href"],
             i.a.string,
             i.find("span", class_="t-tit").string[1:-1],
             datetime.datetime.strftime(date, "%Y-%m-%d"),
             i.find("span", class_="t-time").string.split()[1]))
    return one_page
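
Each element appended to one_page is a 6-tuple lining up with the News columns; the row below is made up purely for illustration:

# Illustration only (hypothetical values): matches (id, url, title, theme, date, time)
example_row = (None,                                        # id, assigned by SQLite
               "http://news.qq.com/a/20170601/000001.htm",  # hypothetical URL
               "Example headline",                          # title
               "Society",                                   # theme, brackets stripped
               "2017-06-01",                                # date
               "08:30")                                     # time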


def to_db(data):
    db = 'E:\\TencentNews.db'
    with sqlite3.connect(db) as conn:
        sql = (
            "INSERT INTO News "
            "(id,url,title,theme,date,time) "
            "VALUES (?, ?, ?, ?, ?, ?)")
        cursor = conn.cursor()
        cursor.executemany(sql, data)
        conn.commit()


# Start the crawler
start_crawler("2017-06-01", "2017-06-05")

# Query the results with pandas
import sqlalchemy
import pandas as pd

sqlite_engine = sqlalchemy.create_engine('sqlite:///E:/TencentNews.db',
                                         encoding='utf-8')


df = pd.read_sql("SELECT * FROM News limit 1", sqlite_engine)
df.date.values
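
With the table declared this way, the date column usually comes back from SQLite as plain text; if you want real datetimes, read_sql can parse it while loading (a sketch, not part of the original):

# Sketch: parse the date column into datetimes while reading
df = pd.read_sql("SELECT * FROM News", sqlite_engine, parse_dates=["date"])
print(df.dtypes)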

Crawling news for day 1
Moving on to the next page

Finished crawling 2017-06-05 00:00:00
Crawling finished normally, closing the browser!

##
# Part 2: crawl the full text of each news link stored in the database

def get_news_body(start_date, end_date):
    for date in pd.date_range(start_date, end_date):  # iterate over each target date
        link_list = get_news_linksfrom_database(date)
        print("INFO: Crawling Date {} News Body Now...".format(date))
        for linkid, url in link_list:
            news_body = get_news_text(url)
            # Write to the database
            writer_news_body_to_database(linkid, news_body)

    print("Finished crawling news bodies from {} to {}!".format(start_date, end_date))

def get_news_text(url):
    html = requests.get(url)
    html.encoding = html.apparent_encoding
    soup = BeautifulSoup(html.text, 'html.parser')
    try:
        return soup.find("div", {"id": "Cnt-Main-Article-QQ"}).text
    except AttributeError:  # the article container was not found on the page
        return None
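
requests.get without a timeout can hang indefinitely on a slow server. A slightly more defensive variant (a sketch; the User-Agent value is an assumption, adjust as you like):

# Sketch: same fetch with a timeout, a basic User-Agent and error handling
def get_news_text_safe(url):
    try:
        html = requests.get(url, timeout=10,
                            headers={"User-Agent": "Mozilla/5.0"})  # assumed UA
        html.encoding = html.apparent_encoding
        soup = BeautifulSoup(html.text, "html.parser")
        node = soup.find("div", {"id": "Cnt-Main-Article-QQ"})
        return node.text if node else None
    except requests.RequestException:
        return None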


def writer_news_body_to_database(linkid, news_body):
    print("INFO: Writing News ID:{} To Database...".format(linkid))
    sql = (
        "INSERT INTO newsbody (id, text) "
        "VALUES (?, ?)")
    db = 'E:\\TencentNews.db'
    with sqlite3.connect(db) as conn:
        cursor = conn.cursor()
        cursor.execute(sql, (linkid, news_body))
        conn.commit()


def get_news_linksfrom_database(date):
    db = 'E:\\TencentNews.db'

    sql = (
    "SELECT news.id, news.url "
    "FROM news LEFT JOIN newsbody "
    "ON news.id = newsbody.id WHERE "
    "news.date = '{}' AND newsbody.id "
    "IS NULL;".format(datetime.datetime.strftime(date, '%Y-%m-%d')))

    with sqlite3.connect(db) as conn:
        cursor = conn.cursor()
        cursor.execute(sql)
        result = cursor.fetchall()
    return result if result else []
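
Both SELECT statements build SQL with str.format; since the date string is generated internally this works, but a parameterised version (a sketch with the same behaviour) avoids any quoting issues:

# Sketch: parameterised variant of the same query
def get_unfetched_links(conn, date_str):
    sql = ("SELECT news.id, news.url FROM news "
           "LEFT JOIN newsbody ON news.id = newsbody.id "
           "WHERE news.date = ? AND newsbody.id IS NULL")
    return conn.execute(sql, (date_str,)).fetchall()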

get_news_body("2017-06-01", "2017-06-02")

INFO: Crawling Date 2017-06-01 00:00:00 News Body Now...
INFO: Writing News ID:1 To Database...
INFO: Writing News ID:2 To Database...

INFO: Crawling Date 2017-06-02 00:00:00 News Body Now...
Finished crawling news bodies from 2017-06-01 to 2017-06-02!
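
To spot-check the result, you can join the two tables with the sqlite_engine defined earlier (a sketch, not part of the original run):

# Sketch: pull a few titles together with their article text
df = pd.read_sql(
    "SELECT News.id, News.title, NewsBody.text "
    "FROM News JOIN NewsBody ON News.id = NewsBody.id LIMIT 5",
    sqlite_engine)
print(df.head())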
