# Python crawler / JS anti-scraping example (python爬虫js反爬案例)
# Scrapes administrative division codes from the Ministry of Civil Affairs (民政部行政区代码).
import requests
from lxml import etree
import re
from bs4 import BeautifulSoup
url = 'http://www.mca.gov.cn/article/sj/xzqh/2019/'
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36'}

# Fetch the index page that lists the yearly "division code" articles.
response = requests.get(url, headers=headers, timeout=10)
html = response.text  # raw HTML of the article index
parse_html = etree.HTML(html)
article_list = parse_html.xpath('//a[@class="artitlelist"]')

# Locate the newest article whose title ends with '代码' (division codes).
new_link = ''
for article in article_list:
    title = article.xpath('./@title')[0]
    if title.endswith('代码'):
        link = article.xpath('./@href')[0]
        new_link = 'http://www.mca.gov.cn' + link
        break  # articles are listed newest-first; stop at the first match

if new_link:
    two_html = requests.get(new_link, headers=headers, timeout=10).text
    # The article page only contains a JS redirect; pull the real target URL.
    # Raw string with escaped dots so we match the literal 'window.location.href'.
    redirect_matches = re.findall(r'window\.location\.href="(.*?)"', two_html)
    if redirect_matches:
        link2 = redirect_matches[0]
        response2 = requests.get(link2, headers=headers, timeout=10)
        html = response2.text
        soup = BeautifulSoup(html, 'html.parser')
        rows = soup.select("tr[style='mso-height-source:userset;height:14.25pt']")
        for row in rows:
            target = row.select("td[class=xl7128029]")
            # Need BOTH cells: target[0] is the numeric code, target[1] the name.
            # (Guarding with > 1, not > 0, avoids an IndexError on target[1].)
            if len(target) > 1:
                # Extract the Chinese place name (CJK range) from the second cell.
                print(target[0].string, re.findall(r'[\u4e00-\u9fa5]{1,10}', str(target[1])))
# Sample output (爬取效果展示):
# 110101 ['东城区']
# 110102 ['西城区']
# 110105 ['朝阳区']
# 110106 ['丰台区']
# 110107 ['石景山区']
# 110108 ['海淀区']
# 110109 ['门头沟区']
# 110111 ['房山区']
# 110112 ['通州区']
# Reference (参考链接): https://www.jb51.net/article/169322.html