Scraping news site content with Python: how to crawl all sub-categories and articles under a site's news directory

import requests
from bs4 import BeautifulSoup
import time  # imported but not used anywhere below

# Entry page of the news category to crawl (domain masked in the original post)
chushiurl = "http://www.**.cc/seojs/"

headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36",
    "Cookie": "PHPSESSID=gpkit3qd1vftnhhkorf0a31d64; Hm_lvt_e3e00d6e883c992081f3141e552754a0=1597818420; Hm_lpvt_e3e00d6e883c992081f3141e552754a0=1597830064"
}

num = 0  # running row ID, only used by the commented-out MySQL branch


def get_mulu():
    """Fetch the category page and follow every sub-category link in the left navigation."""
    res = requests.get(chushiurl, headers=headers)
    soup = BeautifulSoup(res.text, 'lxml')
    muluurls = soup.find(class_="fesleftnav").find_all('a')
    for mululink in muluurls:
        link = mululink.get("href")
        fan_ye(link)


def fan_ye(link):
    """Read the page count of a sub-category and walk through its pagination."""
    res = requests.get(link, headers=headers)
    soup = BeautifulSoup(res.text, 'lxml')
    fanyes = soup.find(class_="pageRemark")
    fan = fanyes.find_all("b")
    yema = int(fan[0].text)  # total number of list pages
    print(yema)
    if yema > 10:
        # Paginated category: visit /page/1 .. /page/yema
        for page in range(yema):
            url = link + "/page/" + str(page + 1)
            get_xiang(url)
    else:
        get_xiang(link)


def get_xiang(link):
    """Collect every article link on one list page."""
    res = requests.get(link, headers=headers)
    soup = BeautifulSoup(res.text, 'lxml')
    xiangurls = soup.find_all(class_="feslist_right1_l")
    for url in xiangurls:
        lua = url.find("a")
        lul = lua.get("href")
        get_neirong(lul)


def get_neirong(lul):
    """Download one article and extract its title and body."""
    res = requests.get(lul, headers=headers)
    soup = BeautifulSoup(res.text, 'lxml')
    neirong = soup.find(class_="wangEditor-container")
    biaoti = soup.find('h1')
    try:
        biaoti = biaoti.text.strip()
        neirong = neirong.text
        xie_ru(biaoti, neirong)
    except Exception:
        # The page did not match the expected layout; log its URL and move on
        print(lul)


def xie_ru(biaoti, neirong):
    """Save the article body to bb/<title>.txt (the bb/ directory must already exist)."""
    with open("bb/" + biaoti + ".txt", "w", encoding="utf-8") as f:
        f.write(neirong)
    # Alternative storage: insert into MySQL instead of writing text files
    # (kept commented out in the original; it would also need `import pymysql`)
    '''
    conn = pymysql.connect(host="127.0.0.1", port=3306, user="root", password="",
                           database="saiweianquan2", charset="utf8")
    cursor_test = conn.cursor()
    str1 = '"' + biaoti + '"'
    str2 = '"' + neirong + '"'
    global num
    sql3 = 'insert into user2 (ID,biaoti,CONTENT) values (%s,%s,%s);'
    sql3 = sql3 % (num, str1, str2)
    try:
        cursor_test.execute(sql3)
        conn.commit()
    except Exception:
        print("insert failed")
    conn.close()
    num += 1
    '''


if __name__ == '__main__':
    get_mulu()
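A practical note on the script above: time is imported but never used, and every function fires its own requests.get in a tight loop, so the target site gets hit as fast as the pages can be parsed. If I wanted to be gentler on the server, I would route all requests through one small helper that reuses a session and pauses between calls. The sketch below is my own addition, not part of the original; the helper name fetch and the one-second delay are arbitrary choices.

import time
import requests

session = requests.Session()      # reuse one connection pool for every request
session.headers.update(headers)   # same headers dict as defined in the script
DELAY = 1.0                       # seconds to wait before each request (arbitrary)

def fetch(url):
    """Throttled GET that raises on HTTP errors and returns the page HTML."""
    time.sleep(DELAY)
    res = session.get(url, timeout=10)
    res.raise_for_status()
    return res.text

Each requests.get(...) call in the functions above could then be replaced with soup = BeautifulSoup(fetch(link), 'lxml').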

Back when I had just started learning Python web scraping, I wrote this little program: give it a site's news category page and it will crawl every sub-category and all of the content under it. I'm sharing it here; the code is above, and the scraped result looks like this:

[Screenshot: result of the crawl]
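One weak spot in xie_ru is that the article title is used directly as the file name: a title containing characters such as /, :, ? or a newline will make open() fail, and the bb/ folder has to exist beforehand. A possible fix, again my own sketch rather than part of the original, is to sanitize the title and create the folder on demand (safe_name is a hypothetical helper name):

import os
import re

def safe_name(biaoti, maxlen=80):
    """Replace characters that are illegal in file names and cap the length."""
    cleaned = re.sub(r'[\\/:*?"<>|\r\n]+', "_", biaoti).strip()
    return cleaned[:maxlen] or "untitled"

def xie_ru(biaoti, neirong):
    os.makedirs("bb", exist_ok=True)                      # create the output folder if missing
    path = os.path.join("bb", safe_name(biaoti) + ".txt")
    with open(path, "w", encoding="utf-8") as f:
        f.write(neirong)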
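The commented-out MySQL branch builds its INSERT statement with % string formatting and hand-added quote characters, so it breaks as soon as a title or article body contains a double quote, and it is open to SQL injection. pymysql can escape the values itself when they are passed as execute() parameters. A minimal sketch of that variant, assuming the same user2 (ID, biaoti, CONTENT) table as the original:

import pymysql

def xie_ru_mysql(num, biaoti, neirong):
    """Insert one article into MySQL, letting pymysql escape the values."""
    conn = pymysql.connect(host="127.0.0.1", port=3306, user="root", password="",
                           database="saiweianquan2", charset="utf8")
    try:
        with conn.cursor() as cursor:
            sql = "INSERT INTO user2 (ID, biaoti, CONTENT) VALUES (%s, %s, %s)"
            cursor.execute(sql, (num, biaoti, neirong))
        conn.commit()
    except Exception:
        print("insert failed:", biaoti)
    finally:
        conn.close()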
