匹配不到内容时,多用 Ctrl+F 查看源代码,确认自己写的格式与审查元素中的标签格式一致,还要注意大小写和单双引号。
使用xpath和BeautifulSoup会相对简单一些,容易定位到文本的标签所在的内容。
正则在使用时,search找到span的开始与结尾,就可以定位到文本的所在。
而想要直接用 `<p>(.*?)</p>` 这类正则去匹配段落时,会有部分内容搜索不到。所以先用 search 定位、切割出源代码中想要的部分,再整体获取即可。
<div id='cms_content_div'><p style="text-align: center;"><IMG src="/_CMS_NEWS_IMG_/upload1/20170609/16311496987245598.jpg" title="2.jpg" hspace="5" vspace="5"/></p>
import requests
import re
from lxml import etree
from bs4 import BeautifulSoup
import time
# Desktop-Chrome User-Agent so the site serves the normal desktop page markup.
headers = {'user-agent':'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36'}
def parse_page(url):
    """Fetch *url* and return its HTML source as text.

    Decodes the body as UTF-8, ignoring undecodable bytes, because the
    target pages occasionally contain malformed characters.  Sleeps
    briefly after each request to be polite to the server.

    :param url: page URL to download.
    :return: decoded HTML source string.
    """
    # timeout keeps a stalled connection from hanging the whole crawl
    response = requests.get(url, headers=headers, timeout=10)
    text = response.content.decode('utf-8', 'ignore')
    time.sleep(0.2)  # throttle request rate
    return text
def parse_detail_page(url):
    """Extract 'title\\n<cleaned text>\\n' from an episode detail page.

    Locates the span between the opening ``<div id='cms_content_div'>``
    tag and the end of the first inline javascript ``<script>`` tag,
    then strips HTML tags and trailing boilerplate.  This could also be
    done with xpath (``//div[@id="cms_content_div"]//text()``) or
    BeautifulSoup (``soup.find('div', id='cms_content_div')``).

    :param url: detail-page URL.
    :return: formatted text block, or None when the page does not match
             the expected template or cannot be decoded.
    """
    try:
        html = parse_page(url)
        time.sleep(0.2)
        # search() pins the start of the content div and the end of the
        # trailing script tag; slicing between them isolates the article.
        div = re.search(r'<div.+?cms_content_div.+?>', html)
        script = re.search(r'<script.+?javascript.+?>', html)
        titles = re.findall(r'<h1>(.*?)</h1>', html)
        if div is None or script is None or not titles:
            # Layout differs from the expected template — skip this page
            # instead of raising AttributeError/IndexError.
            return None
        date = titles[0]
        body = html[div.start():script.end()]
        # Strip tags and trailing boilerplate paragraphs.
        body = re.sub(r'<.*?>|从2017.*|外语广播.*', '', body)
        # NOTE(review): the original pattern here was garbled in transit;
        # it appears to replace HTML quote/space entities with an ASCII
        # apostrophe — confirm against the live page source.
        body = re.sub(r'&nbsp;|&rsquo;', '\'', body)
        return date + '\n' + body + '\n'
    except UnicodeDecodeError as e:
        print(e)
        return None
def parse_pic(url):
    """Return the absolute URL of the page's cover image, or None.

    The page markup uses an uppercase ``<IMG>`` tag with double-quoted
    attributes, so the pattern below matches that exact casing.

    :param url: detail-page URL.
    :return: absolute image URL string, or None when no image matches.
    """
    html = parse_page(url)
    pics = re.findall(r'<IMG src="(.*?)" title', html, re.S)
    if not pics:
        # No cover image in the expected markup; previously this raised
        # IndexError. The caller already treats a falsy result as "skip".
        return None
    return 'http://www.rbc.cn' + str(pics[0])
def main():
    """Crawl the listing pages, saving each episode's text and image.

    Prompts for the number of listing pages, then for every detail link
    found on each listing page downloads the cover image to the current
    directory and appends the episode text to ``en.txt``.
    """
    endpage = int(input('how many pages ?'))  # total number of listing pages
    # Listing pages are named n_52 .. n_<endpage>; the hard-coded start at
    # 52 presumably skips earlier pages with a different layout — TODO confirm.
    pagenums = ['n_%d' % i for i in range(52, endpage + 1)]
    for pagenum in pagenums:
        # Assemble the full listing-page URL.
        url = ('http://www.rbc.cn/audio/yingyuzaoca' + pagenum
               + '.shtml?parent=2972&node=3165')
        page = parse_page(url)
        # Detail links use single quotes in this site's page source.
        part_urls = re.findall(r"<li>.*?</span>.*?href='(.*?)'.*?target",
                               page, re.S)
        for part in part_urls:
            detail_url = 'http://www.rbc.cn/' + str(part)
            info = parse_detail_page(detail_url)
            try:
                pic = parse_pic(detail_url)
                if pic:
                    name = pic.split('/')[-1]
                    r = requests.get(pic, headers=headers, timeout=10)
                    with open(name, 'wb') as f:
                        f.write(r.content)
                    print(pic, "图片完成下载")
            except Exception:
                # Best-effort: a failed image download should not stop
                # the crawl.  (Was a bare except, which also swallowed
                # KeyboardInterrupt.)
                pass
            if info:  # parse_detail_page returns None on failure
                try:
                    with open('en.txt', 'a', encoding="gb18030") as f:
                        f.write(info)
                    print(part, "文本完成下载")
                except UnicodeEncodeError as e:
                    # Show exactly which link produced the bad character.
                    print("UnicodeEncodeError details: " + str(e) + part)


if __name__ == "__main__":
    main()