免责声明:此案例为Python入门静态网页新闻爬取,未对目标网站进行大批量访问,不承担任何责任。
写入数据库版本:
import requests
from bs4 import BeautifulSoup
import numpy as np
import pandas as pd
import sqlalchemy
import pymysql

# MySQL engine via the PyMySQL driver; utf8 charset so Chinese text round-trips.
pymysql_engine = sqlalchemy.create_engine(
    'mysql+pymysql://root:1234@localhost/sampledb?charset=utf8'
)

root_url = 'http://www.ftchinese.com/channel/china.html'
for page in range(1, 4):  # crawl the first three pages only
    url = root_url + "?page={}".format(page)
    print("抓取页面: ", url)
    # timeout so one stalled request cannot hang the whole crawl
    html = requests.get(url, timeout=10)
    soup = BeautifulSoup(html.text, "lxml")
    items = soup.find("div", class_="items")
    one_page_news = []
    for item in items("div", class_="item-inner")[:4]:  # first four stories per page
        title = item.h2.a.string  # headline
        # Full-text link: drop the URL fragment (ad anchor) and request the single-page view.
        tmp_url = "http://www.ftchinese.com" + item.h2.a["href"].split("#")[0] + "?full=y"
        print("抓取新闻: ", tmp_url)
        tmp_soup = BeautifulSoup(requests.get(tmp_url, timeout=10).text, "lxml")
        try:
            # Article body; a page without a story-body div is treated as a photo story.
            text = "\n".join(tmp_soup.find("div", class_="story-body").stripped_strings)
        except AttributeError:
            text = "图片新闻"
        # The lead/summary div may be absent on some items; guard before taking .string.
        lead_div = item.find("div", class_="item-lead", recursive=False)
        lead = lead_div.string if lead_div is not None else None
        one_page_news.append([title, lead, tmp_url, text])
    # Persist this page's rows; index=False keeps the meaningless RangeIndex
    # out of the table (it would collide/duplicate on every append).
    print("To Database...")
    df = pd.DataFrame(one_page_news, columns=['title', 'lead', 'tmp_url', 'text'])
    df.to_sql("ftnews", pymysql_engine, if_exists="append", index=False)
JSON版本:
import requests
from bs4 import BeautifulSoup
import json

all_page_news = []
root_url = 'http://www.ftchinese.com/channel/china.html'
for page in range(1, 4):  # crawl the first three pages only
    url = root_url + "?page={}".format(page)
    print("抓取页面: ", url)
    # timeout so one stalled request cannot hang the whole crawl
    html = requests.get(url, timeout=10)
    soup = BeautifulSoup(html.text, "lxml")
    items = soup.find("div", class_="items")
    one_page_news = []
    for item in items("div", class_="item-inner")[:4]:  # first four stories per page
        title = item.h2.a.string  # headline
        # Full-text link: drop the URL fragment (ad anchor) and request the single-page view.
        tmp_url = "http://www.ftchinese.com" + item.h2.a["href"].split("#")[0] + "?full=y"
        print("抓取新闻: ", tmp_url)
        tmp_soup = BeautifulSoup(requests.get(tmp_url, timeout=10).text, "lxml")
        try:
            # Article body; a page without a story-body div is treated as a photo story.
            text = "\n".join(tmp_soup.find("div", class_="story-body").stripped_strings)
        except AttributeError:
            text = "图片新闻"
        # The lead/summary div may be absent on some items; guard before taking .string.
        lead_div = item.find("div", class_="item-lead", recursive=False)
        lead = lead_div.string if lead_div is not None else None
        one_page_news.append([title, lead, tmp_url, text])
    all_page_news.extend(one_page_news)

# Write all collected news to a JSON file.
# ensure_ascii=False keeps Chinese characters readable in the UTF-8 file
# instead of escaping every one as \uXXXX.
with open("ftnews.json", 'w', encoding="utf8") as file:
    json.dump(all_page_news, file, ensure_ascii=False)

# Read back:
# with open("ftnews.json", 'r', encoding="utf8") as file:
#     data_in = json.load(file)