"""Scrape Sina News (https://news.sina.com.cn/).

Approach:
1. Analyze the Sina News homepage. The homepage loads no data via XHR,
   so we parse the static HTML and collect the URLs of all article
   sub-pages. Every article link lives in an ``<a>`` tag, so we extract
   each tag's ``href``; Sina article URLs follow a recognizable pattern
   (``.shtml`` pages under the homepage domain).
2. Request each article sub-page and extract its title, body paragraphs,
   date, and source, persisting them as JSON.
"""
import os
import requests
import json
from lxml import etree
import re
# Check whether a directory name is valid
# def is_valid_directory_name(directory_name):
#     pattern = r'^[a-zA-Z0-9_\-]+$'  # only letters, digits, underscores and hyphens allowed
#     return bool(re.match(pattern, directory_name))
if __name__ == '__main__':
os.mkdir('data')
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/119.0'
}
pagenum = 1;
list_t = []
# while (pagenum <= 30):
# 1.获取第一页的html数据
url = 'https://news.sina.com.cn/'
page_text = requests.get(url=url, headers=headers)
# 2.数据进行编码设置
page_text.encoding = 'utf-8'
# 3.对数据进行解析
tree = etree.HTML(page_text.text)
list_t.append(tree.xpath('//a/@href')) # 标记所有网页的a标签,获取其内部url
pagenum += 1;
list_all_right = []
# 4.持久化操作
for i in list_t:
for new_url in i:
# 5.获取所有的地址。
# 5.设置新的url进行再一次请求
if (new_url[:len(url)] == url and new_url[-5:] == 'shtml' and len(new_url) < 99):
print(new_url)
list_all_right.append(new_url)
list_all_right = list(set(list_all_right))
for m_url in list_all_right:
response = requests.get(url=m_url, headers=headers)
# 6.新的请求一样需要得到其编码格式
response.encoding = 'utf-8'
newTree = etree.HTML(response.text)
title = newTree.xpath('//h1[@class="main-title"]/text()')
p = newTree.xpath('//div[@id="article"]/p/text()') #此处的p是一个列表
img = newTree.xpath('//div[@class="article-content-left"]//img/@src') #img也是一个列表
if isinstance(title, list):
title = ''.join(title)
if title!='':
os.mkdir('data/'+title)
with open('data/'+title+'/'+title + '.json', 'w', encoding='utf-8') as fp:
data={
}
data["main_title"]=title
data["date"]=newTree.xpath('//span[@class="date"]/text()')
data["source"]=newTree.xpath('//a[@class="source"]/text()')
# 输入正文
print(title)
tem = 1
for i in p:
data["p"+str(tem)]=i
tem+=1
json.dump(data,fp,ensure_ascii=False, indent=4)
# fp.write(title + p)
fp.close()
temp = 1
# for i in img:
# url=i
# if(i[:7]=='//https'):
# i=i[7:]
# if (i[:5] == 'https'):
# i = i[5:]
# url='http://'+i
# imagedata = requests.get(url=url,headers=headers).content
# with open(title+'/img_'+temp+'.jpg','wb') as fp:
# fp.write(imagedata)
# fp.close()