# 12.26 浙江高职院校新闻爬取实战 — news-scraping exercise for Zhejiang vocational colleges

import requests
from bs4 import BeautifulSoup

# Crawl the news list page of zjipc.com, print every article's title/author/
# date/body, and save all articles to data.txt.
BASE = 'http://www.zjipc.com'  # hoisted: the base URL was repeated three times

url = BASE + '/434/list.htm'
data = requests.get(url)
data.encoding = 'utf-8'  # site serves UTF-8; set explicitly before reading .text
bs = BeautifulSoup(data.text, 'lxml')
title = bs.select(".list_news a")   # anchor tags holding article titles
ls = bs.select(".list_bt a")        # anchor tags holding article links
print(len(title))

for i in title:
    print("标题:" + i.getText())
for i in ls:
    print("链接:" + BASE + i.get('href'))

# Collect per-article text in a list and join once at the end; the original
# built the output with repeated string += which is O(n^2).
parts = []
for i in ls:
    urls = BASE + i.get('href')
    data2 = requests.get(urls)
    data2.encoding = "utf-8"
    bs2 = BeautifulSoup(data2.text, 'lxml')
    title2 = bs2.select(".bt")
    # renamed from `time`/`zuozhe`: avoid shadowing the stdlib time module
    pub_time = bs2.select("span[frag='窗口113']")   # publication date element
    author = bs2.select("span[frag='窗口112']")     # author element
    print("##################################" + i.getText() + "#######################################")
    print("标题:")
    print(title2[0].getText())
    print("作者:")
    print(author[0].getText())
    print("时间:")
    print(pub_time[0].getText())
    body = bs2.select(".Article_Content")  # article body container
    print("正文:")
    print(body[0].getText())
    parts.append(
        title2[0].getText() + "\n"
        + author[0].getText() + "\n"
        + pub_time[0].getText() + "\n"
        + urls + "\n"
        + body[0].getText() + "\n\n"
    )

with open("data.txt", "w", encoding="utf-8") as f:
    f.write("".join(parts))


# ===================== Script 2: wzvtc.cn scraper =====================


import requests
from bs4 import BeautifulSoup

# Crawl the news list of wzvtc.cn, print each article's title/date/body,
# and save everything to big.txt.
# FIX: in the original paste the URL string literals were split across blank
# lines (a syntax error); they are rejoined into single literals here.
BASE = "http://www.wzvtc.cn"

urls = BASE + "/list/21.html"
html = requests.get(urls)
html.encoding = "utf-8"  # set before reading .text so decoding is correct
bs = BeautifulSoup(html.text, 'lxml')
ls = bs.select(".page_list_title")
print(ls[0].getText())
hs = bs.select(".page_list_title a")

# Accumulate article texts in a list and join once; the original used
# quadratic `count = count + ss` string concatenation.
parts = []
for i in range(1, len(hs)):  # starts at 1, skipping the first anchor (original behavior)
    htmls = BASE + hs[i].get('href')
    print(htmls)
    htm11 = requests.get(htmls)
    htm11.encoding = "utf-8"
    br = BeautifulSoup(htm11.text, 'lxml')
    bt = br.select("#ShowArticle_title")    # article title element
    print(len(bt))
    sj = br.select("#ShowArticle_type")     # date/metadata element
    zw = br.select("#ShowArticle_Content")  # article body element
    print("=======标题=======")
    print(bt[0].string)
    print("=======时间=======")
    print(sj[0].getText())
    print("=======正文=======")
    print(zw[0].getText())
    parts.append(
        bt[0].string + "\n" + sj[0].getText() + "\n" + zw[0].getText()
        + "\n" + "===========================" + "\n"
    )

with open("big.txt", "w", encoding="utf-8") as f:
    f.write("".join(parts))



# ===================== Script 3: zjiet.edu.cn scraper =====================


import requests
from bs4 import BeautifulSoup  # FIX: original had a stray backtick after "bs4" (syntax error)

# Crawl the notice list of zjiet.edu.cn, print each article's title/date/body,
# and save everything to big.txt.
BASE = "http://www.zjiet.edu.cn"

urls = BASE + "/108/list.htm"
html = requests.get(urls)
html.encoding = "utf-8"  # set before reading .text so decoding is correct
bs = BeautifulSoup(html.text, 'lxml')
ls = bs.select(".tongzhinrx")
print(ls[0].getText())
hs = bs.select(".tongzhinrx a")

# Accumulate article texts in a list and join once; the original used
# quadratic `count = count + ss` string concatenation.
parts = []
# step 2: every other anchor — presumably the list pairs each article with a
# duplicate link; TODO confirm against the page markup
for i in range(1, len(hs), 2):
    htmls = BASE + hs[i].get('href')
    print(htmls)
    htm11 = requests.get(htmls)
    htm11.encoding = "utf-8"
    br = BeautifulSoup(htm11.text, 'lxml')
    bt = br.select(".biaoti h1")        # article title element
    print(len(bt))
    sj = br.select(".jiathis_txt")      # date/metadata element
    zw = br.select(".wp_articlecontent")  # article body element
    print("=======标题=======")
    print(bt[0].string)
    print("=======时间=======")
    print(sj[0].getText())
    print("=======正文=======")
    print(zw[0].getText())
    parts.append(
        bt[0].string + "\n" + sj[0].getText() + "\n" + zw[0].getText()
        + "\n" + "===========================" + "\n"
    )

with open("big.txt", "w", encoding="utf-8") as f:
    f.write("".join(parts))

# ===================== Fragments =====================



# Non-runnable fragments (they reference `hr` and `i` defined elsewhere);
# the string literals were split across blank lines in the paste and are
# rejoined here:
#
#   urls = "http://www.zjitc.net" + hr[i].get('href').lstrip('..')
#
#   urls = "http://www.zjtie.edu.cn/" + hr.lstrip("../..")
#
# NOTE(review): str.lstrip strips a *character set*, not a prefix — lstrip('..')
# removes every leading '.', which may not be the intent; confirm against the hrefs.
  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值