Python 抓取网络文章

# -*- coding: utf-8 -*-
from bs4 import BeautifulSoup
import requests
import urllib.request
from requests.exceptions import RequestException
import csv
import pandas as pd


import random


def getUrl():
    """Collect the article links listed on the index page.

    Downloads the index page and, for every ``#list li`` item, resolves the
    ``href`` of its first anchor against the article host.

    Returns:
        list: Absolute article URLs in page order. List items that carry no
        link are skipped.

    Raises:
        requests.exceptions.RequestException: If the index page cannot be
            downloaded (including when the timeout expires).
    """
    from urllib.parse import urljoin

    # A timeout keeps a stalled server from hanging the script forever.
    res = requests.get('https://xxx.com/', timeout=10)
    # Force UTF-8 so the page text is not decoded into garbled characters.
    res.encoding = 'utf-8'

    soup = BeautifulSoup(res.text, 'html.parser')

    data = []
    for news in soup.select('#list li'):
        anchor = news.find('a')
        href = anchor.get('href') if anchor is not None else None
        if not href:
            # An item without a usable link cannot be turned into a URL.
            continue
        # NOTE(review): the index is fetched from xxx.com but links are
        # resolved against xxx.org -- confirm both hosts are intended.
        # urljoin also copes with relative and already-absolute hrefs.
        data.append(urljoin('https://xxx.org', href))
    return data

# Article URLs are collected once, when the module is loaded.
urls = getUrl()

# 获取页面内容
def getHtml(url):
    """Download a page and return its body text.

    Args:
        url: Address of the page to fetch.

    Returns:
        The response text when the server answers with HTTP 200; otherwise
        None (any other status code, or a failed request).
    """
    try:
        # Timeouts are RequestException subclasses, so a stalled server is
        # reported like any other request failure instead of blocking.
        response = requests.get(url, timeout=10)
    except RequestException:
        print('===request exception===')
        return None

    if response.status_code == 200:
        return response.text
    # Any other status is treated as "no content".
    return None

# 解析网页
def parse_html(html):
    """Extract the article title and body text from an article page.

    Args:
        html: Page markup, or None/empty when the download failed.

    Returns:
        A ``(title, content)`` tuple of strings, or None when ``html`` is
        empty or the page lacks the expected ``#entry`` / ``h1`` /
        ``#entrybody`` elements. A failure is reported on stdout.
    """
    title = None
    content = None

    if html:
        try:
            soup = BeautifulSoup(html, 'html.parser')

            for tag in soup.select('#entry'):
                heading = tag.find('h1')
                if heading is None:
                    # An entry without a heading makes the page unusable.
                    title = None
                    break
                title = heading.get_text()

                for art in tag.select('#entrybody'):
                    content = art.get_text()
        except Exception:
            # Last-resort guard: one malformed page must not abort the
            # whole scraping run, so it is reported as a parse failure.
            title = None

    if title is None or content is None:
        print('===parseHtml exception===')
        return None
    return title, content

# 保存到csv表中
def save2csv(title, content, csv_path='xx.csv'):
    """Append one article to the CSV file, writing the header only once.

    Args:
        title: Article title.
        content: Article body text.
        csv_path: Destination file; defaults to ``xx.csv``.
    """
    import os

    # Only a missing or empty file needs the header row; repeating it on
    # every append would scatter header lines through the data.
    needs_header = (
        not os.path.exists(csv_path) or os.path.getsize(csv_path) == 0
    )

    with open(csv_path, 'a', newline='', encoding='utf-8') as csvfile:
        writer = csv.writer(csvfile)
        if needs_header:
            writer.writerow(['title', 'content'])
        writer.writerow([title, content])

def article(url_list=None):
    """Fetch, parse and save every article.

    Args:
        url_list: Iterable of article URLs. Defaults to the module-level
            ``urls`` list when omitted.
    """
    if url_list is None:
        url_list = urls

    for url in url_list:
        info = parse_html(getHtml(url))

        if info is None:
            # Keep a row for pages that could not be fetched or parsed,
            # using the URL in place of both fields.
            title = content = url
        else:
            title, content = info

        save2csv(title, content)


# Run the scraper: fetch, parse and save every collected article.
article()

 

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值