Python爬取法规

最新推荐文章于 2023-04-25 22:25:02 发布

撸码的徐哥

最新推荐文章于 2023-04-25 22:25:02 发布

阅读量1.5k

点赞数

分类专栏：爬虫

本文链接：https://blog.csdn.net/weixin_41922379/article/details/85092607

版权

爬虫专栏收录该内容

2 篇文章 0 订阅

订阅专栏

#_*_ coding:utf-8 _*_  
import json
import os
import re
import time
import urllib.parse
import urllib.request

import lxml
import lxml.etree
import requests
from bs4 import BeautifulSoup
# Browser-like User-Agent sent with every request so the server treats the
# crawler as an ordinary client.
user_agent='Mozilla/4.0(compatible;MSIE 5.5;Windows NT)'
headers={'User-Agent':user_agent}
x=0
# Fetch the first listing page once to find out how many listing pages exist.
ursl='http://www.szpl.gov.cn/xxgk/zcwj/zcfg/index.html'
reqst=urllib.request.Request(url=ursl,headers=headers)
# timeout=60: give up if the server has not answered within 60 seconds.
# The response is read inside `with` so the connection is always closed.
with urllib.request.urlopen(reqst,timeout=60) as page1:
    smthtmls=page1.read().decode('gb2312')
seinfo=lxml.etree.HTML(smthtmls)
# The pager is produced by an inline <script> inside <ul class="pagination">.
pager_scripts=seinfo.xpath('//ul[@class="pagination"]/script/text()')
if not pager_scripts:
    raise ValueError("pagination script not found on "+ursl)
product_infos=pager_scripts[0]
# Capture the text between the first pair of parentheses in that script; the
# first comma-separated value inside it is used as the page count.
p1 = re.compile(r'[(](.*?)[)]', re.S)
pager_call=p1.search(product_infos)
if pager_call is None:
    raise ValueError("no parenthesised argument list in pagination script on "+ursl)
p2=pager_call.group(1)
sli=p2.split(',')
yepage=sli[0]
intinfo=int(yepage)
def _fetch_html(url, encoding='gb2312'):
    """Download *url* with the crawler headers and return the decoded body.

    The response is closed before returning. Raises whatever
    ``urllib.request.urlopen`` raises on network errors and
    ``UnicodeDecodeError`` if the body is not valid in *encoding*.
    """
    # NOTE(review): pages are decoded strictly as gb2312, as before; a page
    # containing characters outside that charset will raise and be skipped.
    request=urllib.request.Request(url=url,headers=headers)
    # timeout=60: give up if the server has not answered within 60 seconds.
    with urllib.request.urlopen(request,timeout=60) as response:
        return response.read().decode(encoding)


# Directory that holds the listing pages (index.html, index_1.html, ...).
LISTING_BASE_URL='http://www.szpl.gov.cn/xxgk/zcwj/zcfg/'

# Page 1 is index.html and page k (k > 1) is index_<k-1>.html.
# NOTE(review): this assumes `intinfo` is the total number of listing pages,
# so the last page is index_<intinfo-1>.html. The previous
# range(1, intinfo) never requested that last page - confirm against the site.
for page_no in range(1,intinfo+1):
    try:
        # Append mode keeps results from earlier pages and earlier runs.
        # `with` guarantees the file is closed even if this page fails.
        # NOTE(review): the file uses the platform default encoding, as before.
        with open("G:/szpltest.txt","a+") as out_file:
            print("开始打印第"+str(page_no)+"页")
            if page_no==1:
                page_url=LISTING_BASE_URL+"index.html"
            else:
                page_url=LISTING_BASE_URL+"index_"+str(page_no-1)+".html"
            print("开始链接第"+str(page_no)+"页")
            html=_fetch_html(page_url)
            selector=lxml.etree.HTML(html)
            print("开始得到第"+str(page_no)+"页内容")
            product_infos=selector.xpath('//ul[@class="list-group"]/li')
            for product in product_infos:
                # Best effort per entry: a malformed row or a failed detail
                # download is reported and the crawl moves on to the next row.
                try:
                    Timewera=product.xpath('span[@class="hidden-sm hidden-xs"]/text()')[0]
                    Child_url=product.xpath('a/@href')[0]
                    ChildText=product.xpath('a/text()')[0]
                    print(Timewera+"          "+Child_url+"          "+ChildText)
                    out_file.write("第"+str(page_no)+"页的数据为："+Timewera+"    "+Child_url+"    "+ChildText+"\n")
                    # Resolve the link against the listing page URL. urljoin
                    # handles "./x", "../x" and absolute links, where removing
                    # the first "." and concatenating did not.
                    detail_url=urllib.parse.urljoin(page_url,Child_url)
                    detail_html=_fetch_html(detail_url)
                    # Name the parser explicitly so the result does not depend
                    # on which parsers happen to be installed.
                    soup=BeautifulSoup(detail_html,'lxml')
                    # The detail text is taken from <font id="Zoom"> elements.
                    question=soup.find_all('font',{'id':"Zoom"})
                    print(question)
                    out_file.write(str(question))
                except Exception as e:
                    print(e)
        # Pause between listing pages to avoid hammering the server.
        time.sleep(3)
    except Exception as e:
        # Best effort per page: report the failure and continue with the next.
        print(e)