# -*- coding: utf-8 -*-
from bs4 import BeautifulSoup
import requests
import urllib.request
from requests.exceptions import RequestException
import csv
import pandas as pd
import random
def getUrl():
    """Collect article URLs from the site index page.

    Returns:
        list[str]: absolute article URLs built from the ``#list li``
        anchors of the index page.
    """
    # timeout so a stalled server cannot hang the scraper forever
    res = requests.get('https://xxx.com/', timeout=10)
    res.encoding = 'utf-8'  # force UTF-8 so the page text does not come out garbled
    soup = BeautifulSoup(res.text, 'html.parser')
    # NOTE(review): the index is fetched from xxx.com but article links are
    # joined against xxx.org -- confirm the two hosts are really meant to differ.
    return ['https://xxx.org' + news.find('a').get('href')
            for news in soup.select('#list li')]
urls = getUrl();
# 获取页面内容
# Fetch page content
def getHtml(url):
    """Return the HTML text of *url*, or None on error or non-200 status.

    Args:
        url: absolute URL of the page to download.
    """
    try:
        # timeout prevents one unresponsive host from blocking the whole crawl
        response = requests.get(url, timeout=10)
        if response.status_code == 200:
            return response.text
    except RequestException:
        print('===request exception===')
    # non-200 responses and request failures both fall through to None
    return None
# 解析网页
# Parse an article page
def parse_html(html):
    """Extract (title, content) from an article page.

    Args:
        html: page HTML as returned by getHtml(); may be None on fetch failure.

    Returns:
        tuple[str, str] | None: (title, content) of the first ``#entry``
        element, or None when *html* is None, the expected structure is
        missing, or parsing raises.
    """
    if html is None:
        # getHtml() returns None on failure; bail out before parsing
        return None
    try:
        soup = BeautifulSoup(html, 'html.parser')
        for tag in soup.select('#entry'):
            title = tag.find('h1').get_text()
            for art in tag.select('#entrybody'):
                # (earlier revisions decomposed the 'fengxibutton'/'fenxi'
                # divs here before extracting text)
                content = art.get_text()
                return title, content
    except Exception:
        print('===parseHtml exception===')
    # no matching #entry/#entrybody found
    return None
# 保存到csv表中
# Save to the csv file
def save2csv(title, content):
    """Append one (title, content) row to xx.csv.

    Writes the header row only when the file is new/empty; the original
    version rewrote the header before every data row, interleaving headers
    throughout the appended file.
    """
    with open('xx.csv', 'a+', newline='', encoding='utf-8') as csvfile:
        writer = csv.writer(csvfile)
        if csvfile.tell() == 0:
            # file just created (or empty): emit the header exactly once
            writer.writerow(['title', 'content'])
        writer.writerow([title, content])
def article():
    """Crawl every URL in the module-level *urls* list and persist each
    article via save2csv().

    When a page cannot be fetched or parsed, the URL itself is stored in
    both columns so the failed row remains traceable.
    """
    for url in urls:
        html = getHtml(url)
        info = parse_html(html)
        if info is None:  # identity check: parse_html signals failure with None
            title = url
            content = url
        else:
            title, content = info
        save2csv(title, content)
article()