Scraping website news with Python

# -*- coding:utf-8 -*-

import json
import time
import requests
import logging.handlers
import pickle
import sys
import re
import datetime
import importlib
import csv
import string
import zhon.hanzi
from bs4 import BeautifulSoup

importlib.reload(sys)  # Python 2-era encoding workaround; harmless under Python 3

# one shared HTTP session for every request
session = requests.session()

# ASCII and Chinese punctuation tables (kept for text cleaning)
punce = string.punctuation
puncz = zhon.hanzi.punctuation

# output files: one CSV for full articles, one for 200-500 character paragraph chunks
f = open('0906/0906electron原文.csv', 'w+', encoding='utf-8', newline='')
fp = open('0906/0906electron段落.csv', 'w+', encoding='utf-8', newline='')
csv_article = csv.writer(f)
csv_para = csv.writer(fp)

allparas = []

def getNewsDetail(newsurl):
    """Fetch one article page; return its full text plus 200-500 character paragraph chunks."""
    news_p = []
    p1 = ''
    # res = requests.get(newsurl)
    # res.encoding = 'utf-8'
    result = session.get(url=newsurl)
    soup = BeautifulSoup(result.text, 'html.parser')
    # news_p.append([p.text.strip() for p in soup.select('.u-mainText p')])
    # accumulate <p> texts until the buffer reaches 200-500 characters, then save it as one chunk
    for p in soup.select('.newsContent p'):
        p1 = p1 + p.text.replace('\n', '')
        if 200 <= len(p1) <= 500:
            news_p.append(p1)
            p1 = ''
    # the full article: every <p> text joined with spaces
    news_article = ' '.join([p.text.strip().replace('\n', '') for p in soup.select('.newsContent p')])
    return news_article, news_p
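
# Usage sketch (illustrative, not part of the original run): call getNewsDetail on a
# single article page and inspect the result. The URL is the sample noted in spider()
# below and is assumed to still resolve.
#
#   article_text, chunks = getNewsDetail('http://www.dsti.net/Information/News/120652')
#   print(len(chunks), 'paragraph chunks of 200-500 characters')
#   print(article_text[:200])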

def spider():
    # pages = [57918,57919,234399,234400]
    # http://www.dsti.net/Information/HyeList/aviation/    pages 0-487
    # http://www.dsti.net/Information/HyeList/spaceflight  pages 0-48
    # http://www.dsti.net/Information/HyeList/electron/    pages 1-30
    for page in range(1, 30):
        # build the list-page url
        # url = "http://mil.gmw.cn/node_8979"+onepage+".htm"
        # url = "http://www.dsti.net/Information/HyeList/spaceflight/" + str(page)
        url = "http://www.dsti.net/Information/HyeList/electron/" + str(page)
        print(url)
        # spoof browser request headers
        headers = {
            'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
            'user-agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36',
        }
        result = session.get(url=url, headers=headers).content
        # soup = BeautifulSoup(result,'html.parser').encode('GBK','ignore').decode('GBK')
        soup = BeautifulSoup(result, 'html.parser', from_encoding="gb18030")
        if soup is None:
            break
        # req = requests.get(headers=headers, url=url)
        # content = req.content
        # soup = content.decode('gbk')
        # res = requests.get(url=url, headers=headers)
        # res.encoding = 'gb18030'
        # soup = BeautifulSoup(res.text, 'html.parser')
        # locate the news list
        result_div = soup.find('div', attrs={'class': 'listMidContent'}).find('ul')
        # result_div = result_div.encode('GBK','ignore').decode('GBK')
        # strip line breaks so one regex can match across the whole list
        result_replace = str(result_div).replace('\n', '').replace('\r', '').replace('\t', '')
        # regex-match each list entry
        result_list = re.findall('<li>.(.*?)</li>', result_replace)
        for i in result_list:
            # e.g. http://www.dsti.net/Information/News/120652
            news_url = 'http://www.dsti.net/' + re.findall('href="(.*?)" target="_blank">', i)[0]
            news_name = re.findall('target="_blank">(.*?)</a>', i)[0]
            # news_time = re.findall('\((.*?)\)', i)[0]
            # title - paragraph chunks
            news_article, news_p = getNewsDetail(news_url)
            for p1 in news_p:
                if p1 != '':
                    csv_para.writerow([p1.replace("\u00a0", ""), news_name.replace("\u00a0", "")])
            # title - full article
            if news_article != '':
                csv_article.writerow([news_name.replace("\u00a0", ""), news_article.replace("\u00a0", "")])
            # time.sleep(1)


spider()
f.close()
fp.close()
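
The regex step above is fragile: the pattern breaks whenever the list markup changes. Below is a sketch of the same extraction done with BeautifulSoup's own tag navigation instead of a regex. The 'listMidContent' container, the gb18030 encoding, and the 'http://www.dsti.net/' prefix come from the script above; the helper name and the assumption that every entry is an <a target="_blank"> link inside that container are illustrative, not verified against the live site.

def list_page_links(page_bytes):
    # Parse one list page (raw bytes from session.get(...).content) and yield
    # (news_url, news_name) pairs without any regex.
    soup = BeautifulSoup(page_bytes, 'html.parser', from_encoding='gb18030')
    container = soup.find('div', attrs={'class': 'listMidContent'})
    if container is None:
        return
    for a in container.find_all('a', target='_blank'):
        href = a.get('href', '')
        if href:
            yield 'http://www.dsti.net/' + href.lstrip('/'), a.get_text(strip=True)

With such a helper, the page loop in spider() could iterate over list_page_links(result) and call getNewsDetail(news_url) for each pair, dropping result_replace, result_list, and the per-item re.findall calls.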
