# Python crawler / JS anti-scraping example (python爬虫js反爬案例)
# Scrapes administrative division codes from the Ministry of Civil Affairs (民政部行政区代码).
import requests
from lxml import etree
import re
from bs4 import BeautifulSoup
url = 'http://www.mca.gov.cn/article/sj/xzqh/2019/'
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36'}

# Fetch the index page that lists the yearly "division code" articles.
response = requests.get(url, headers=headers, timeout=10)
html = response.text  # raw HTML of the article index
parse_html = etree.HTML(html)
article_list = parse_html.xpath('//a[@class="artitlelist"]')

# Locate the newest article whose title ends with '代码' (division codes).
new_link = ''
for article in article_list:
    title = article.xpath('./@title')[0]
    if title.endswith('代码'):
        link = article.xpath('./@href')[0]
        new_link = 'http://www.mca.gov.cn' + link
        break  # articles are listed newest-first; stop at the first match

if new_link:
    two_html = requests.get(new_link, headers=headers, timeout=10).text
    # The article page only contains a JS redirect; pull the real target URL.
    # Raw string with escaped dots so we match the literal 'window.location.href'.
    redirect_matches = re.findall(r'window\.location\.href="(.*?)"', two_html)
    if redirect_matches:
        link2 = redirect_matches[0]
        response2 = requests.get(link2, headers=headers, timeout=10)
        html = response2.text
        soup = BeautifulSoup(html, 'html.parser')
        rows = soup.select("tr[style='mso-height-source:userset;height:14.25pt']")
        for row in rows:
            target = row.select("td[class=xl7128029]")
            # Need BOTH cells: target[0] is the numeric code, target[1] the name.
            # (Guarding with > 1, not > 0, avoids an IndexError on target[1].)
            if len(target) > 1:
                # Extract the Chinese place name (CJK range) from the second cell.
                print(target[0].string, re.findall(r'[\u4e00-\u9fa5]{1,10}', str(target[1])))
# Sample output (爬取效果展示):
# 110101 ['东城区']
# 110102 ['西城区']
# 110105 ['朝阳区']
# 110106 ['丰台区']
# 110107 ['石景山区']
# 110108 ['海淀区']
# 110109 ['门头沟区']
# 110111 ['房山区']
# 110112 ['通州区']
# Reference (参考链接): https://www.jb51.net/article/169322.html