#_*_ coding:utf-8 _*_
import json
import os
import re
import time
import urllib.request

import lxml
import lxml.etree  # `import lxml` alone does NOT load the etree submodule
import requests
from bs4 import BeautifulSoup
# Crawl setup: fetch the first listing page of the szpl.gov.cn policy index
# and extract the total number of listing pages from the inline pagination
# <script> call (its first argument is the page count).
user_agent = 'Mozilla/4.0(compatible;MSIE 5.5;Windows NT)'  # spoofed UA so the site serves us
headers = {'User-Agent': user_agent}
x = 0
ursl = 'http://www.szpl.gov.cn/xxgk/zcwj/zcfg/index.html'
reqst = urllib.request.Request(url=ursl, headers=headers)  # headers: pretend to be a browser
page1 = urllib.request.urlopen(reqst, timeout=60)  # give up on the request after 60s
# gbk is a superset of gb2312 and tolerates the extended characters Chinese
# sites routinely mix in; 'replace' keeps one bad byte from aborting the run.
smthtmls = page1.read().decode('gbk', errors='replace')
page1.close()  # release the socket explicitly (original leaked it)
seinfo = lxml.etree.HTML(smthtmls)
# The pagination widget is an inline script like createPageHTML(12, 0, ...);
# grab the text between the parentheses and take the first comma field.
product_infos = seinfo.xpath('//ul[@class="pagination"]/script/text()')[0]
p1 = re.compile(r'[(](.*?)[)]', re.S)
p2 = re.findall(p1, product_infos)[0]
sli = p2.split(',')
yepage = sli[0]
intinfo = int(yepage)  # total number of listing pages; consumed by the loop below
# Walk every listing page, record each document's date / link / title into
# G:/szpltest.txt, then fetch the linked detail page and append its
# <font id="Zoom"> body to the same file.
for i in range(1, intinfo + 1):  # +1: range(1, intinfo) skipped the last page
    try:
        print("开始打印第" + str(i) + "页")
        req_url_base = 'http://www.szpl.gov.cn/xxgk/zcwj/zcfg/'
        req_url = 'http://www.szpl.gov.cn/xxgk/zcwj/zcfg/'
        # Page 1 lives at index.html; page N (N > 1) at index_{N-1}.html.
        # Derive the suffix without mutating the loop variable (the original
        # did `i = i - 1` mid-body, corrupting every later use of i).
        if i == 1:
            req_url_base = req_url_base + "index.html"
        else:
            req_url_base = req_url_base + "index_" + str(i - 1) + ".html"
        request = urllib.request.Request(url=req_url_base, headers=headers)
        print("开始链接第" + str(i) + "页")
        page = urllib.request.urlopen(request, timeout=60)  # 60s cap per page
        # gbk superset of gb2312; 'replace' so one bad byte can't kill the page
        html = page.read().decode('gbk', errors='replace')
        page.close()
        selector = lxml.etree.HTML(html)
        print("开始得到第" + str(i) + "页内容")
        product_infos = selector.xpath('//ul[@class="list-group"]/li')
        # `with` guarantees the file is closed even when a row raises
        # (the original leaked the handle on any exception before close()).
        # Append mode so every page accumulates into the same file.
        with open("G:/szpltest.txt", "a+", encoding="utf-8") as file:
            for product in product_infos:
                try:
                    Timewera = product.xpath('span[@class="hidden-sm hidden-xs"]/text()')[0]
                    Child_url = product.xpath('a/@href')[0]
                    ChildText = product.xpath('a/text()')[0]
                    print(Timewera + " " + Child_url + " " + ChildText)
                    file.write("第" + str(i) + "页的数据为:" + Timewera + " " + Child_url + " " + ChildText + "\n")
                    # hrefs look like ./2020/xxx.html — drop the leading dot
                    # and join onto the section root for an absolute URL.
                    Houzui = Child_url.replace('.', '', 1)
                    surl = req_url + Houzui
                    requestdetails = urllib.request.Request(url=surl, headers=headers)
                    PageDetals = urllib.request.urlopen(requestdetails, timeout=60)
                    HtmlDetails = PageDetals.read().decode('gbk', errors='replace')
                    PageDetals.close()  # don't leak one socket per document
                    # Explicit parser: bs4's auto-pick warns and can differ
                    # between machines; lxml is already a dependency here.
                    soup = BeautifulSoup(HtmlDetails, 'lxml')
                    question = soup.findAll('font', {'id': "Zoom"})
                    print(question)
                    file.write(str(question))
                except Exception as e:
                    # best-effort: one broken row must not kill the page loop
                    print(e)
        time.sleep(3)  # be polite to the server between listing pages
    except Exception as e:
        print(e)