Python Web Scraping: A Beginner's Example

Disclaimer: This is a beginner-level example of scraping news articles from static web pages with Python. It does not make bulk requests to the target site, and no liability is assumed.

Version that writes to a database:

import requests
from bs4 import BeautifulSoup
import numpy as np
import pandas as pd
import sqlalchemy
import pymysql

# set up the database connection (MySQL via PyMySQL)
pymysql_engine = sqlalchemy.create_engine('mysql+pymysql://root:1234@localhost/sampledb?charset=utf8')

root_url = 'http://www.ftchinese.com/channel/china.html'
for i in range(1, 4):  # limit the crawl to the first three pages
    url = root_url + "?page={}".format(i)
    print("Fetching page: ", url)
    html = requests.get(url)
    soup = BeautifulSoup(html.text, "lxml")

    items = soup.find("div", class_="items")
    one_page_news = []
    for item in items("div", class_="item-inner")[:4]:  # limit to the first four articles per page
        title = item.h2.a.string  # article title
        # full-text link for the article: strip the anchor and request the single-page view to skip ads
        tmp_url = "http://www.ftchinese.com" + item.h2.a["href"].split("#")[0] + "?full=y"
        print("Fetching article: ", tmp_url)
        tmp_soup = BeautifulSoup(requests.get(tmp_url).text, "lxml")
        try:
            # article body; anything without a text body is roughly treated as a photo story
            text = "\n".join(tmp_soup.find("div", class_="story-body").stripped_strings)
        except AttributeError:
            text = "photo story"
        lead = item.find("div", class_="item-lead", recursive=False).string  # article summary
        one_news = [title, lead, tmp_url, text]
        one_page_news.append(one_news)

    # write this page's articles to the database
    print("To Database...")
    data = np.array(one_page_news)
    df = pd.DataFrame(data, columns=['title', 'lead', 'tmp_url', 'text'])
    df.to_sql("ftnews", pymysql_engine, if_exists="append")


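To confirm that the rows actually landed in MySQL, a minimal read-back sketch like the one below can be used (assuming the same sampledb connection string and the ftnews table created by the script above):

import pandas as pd
import sqlalchemy

# assumes the same connection settings as the scraper above
engine = sqlalchemy.create_engine('mysql+pymysql://root:1234@localhost/sampledb?charset=utf8')

# read a couple of columns back and show the first few rows
df_back = pd.read_sql("SELECT title, tmp_url FROM ftnews", engine)
print(df_back.head())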


JSON version:

import requests
from bs4 import BeautifulSoup
import json

all_page_news = []
root_url = 'http://www.ftchinese.com/channel/china.html'
for i in range(1, 4):  # limit the crawl to the first three pages
    url = root_url + "?page={}".format(i)
    print("Fetching page: ", url)
    html = requests.get(url)
    soup = BeautifulSoup(html.text, "lxml")

    items = soup.find("div", class_="items")
    one_page_news = []
    for item in items("div", class_="item-inner")[:4]:  # limit to the first four articles per page
        title = item.h2.a.string  # article title
        tmp_url = "http://www.ftchinese.com" + item.h2.a["href"].split("#")[0] + "?full=y"  # full-text link, anchor stripped
        print("Fetching article: ", tmp_url)
        tmp_soup = BeautifulSoup(requests.get(tmp_url).text, "lxml")
        try:
            text = "\n".join(tmp_soup.find("div", class_="story-body").stripped_strings)  # article body
        except AttributeError:
            text = "photo story"  # no text body found; treat as a photo story
        lead = item.find("div", class_="item-lead", recursive=False).string  # article summary
        one_news = [title, lead, tmp_url, text]
        one_page_news.append(one_news)
    all_page_news.extend(one_page_news)

# write all collected articles to a JSON file; ensure_ascii=False keeps Chinese text readable
with open("ftnews.json", 'w', encoding="utf8") as file:
    json.dump(all_page_news, file, ensure_ascii=False)

# read it back
# with open("ftnews.json", 'r', encoding="utf8") as file:
#     data_in = json.load(file)
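As a quick check of the saved file, the sketch below (not part of the original script; the field order follows the one_news list built above) reloads ftnews.json and prints each title with its URL:

import json

# load the scraped articles back from the JSON file
with open("ftnews.json", 'r', encoding="utf8") as file:
    data_in = json.load(file)

# each entry is [title, lead, tmp_url, text], in the order built above
for title, lead, tmp_url, text in data_in:
    print(title, "->", tmp_url)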