# @Time:2021-9-27 15:21
# coding:utf-8
import re
import time
from urllib.parse import urljoin

import requests
from lxml import etree
domain = "http://www.bzmfxz.com"
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36'}
f = "lucky.txt"
with open(f,"a") as file:
page = 68
while page>0:
# page = 1
url = 'http://www.bzmfxz.com/biaozhun/Soft/YDTXBZ/List_' + str(page) + '.html'
# url = "http://www.bzmfxz.com/biaozhun/Soft/YDTXBZ/List_1.html"
print(url)
resp = requests.get(url, headers=headers)
resp.encoding = 'utf-8' # 指定字符集
resp.close()
# print(resp.text)
html = etree.HTML(resp.text)
# lss = html.xpath('//*[@id="main_right_box"]/div[2]/div[3]/div[1]/div')
lss = html.xpath('//*[@id="main_right_box"]/div[2]/div[3]/div/div')
for liis in lss:
title = liis.xpath("./a/text()")
dec = liis.xpath("./a/@href")
title = list(item.replace("\r\n", "")for item in title)
decref = list(item.replace("/biaozhun", "http://www.bzmfxz.com/biaozhun")for item in dec)
# resp2 = requests.get(decref, headers=headers)
decrefdw = list(item.replace("/biaozhun", "http://www.bzmfxz.com/biaozhun")for item in decref)
# ['http://www.bzmfxz.com/Common/ShowDownloadUrl.aspx?urlid=0&id=26420']
# ['http://www.bzmfxz.com/biaozhun/Soft/YDTXBZ/2008/01/31/26420.html']
if len(title) > 0:
newTitle = title[0]
newDecUrl = decref[0]
test = re.search(r'([^/]+)\.[h]', newDecUrl)
newDecDownUrl = "http://www.bzmfxz.com/Common/ShowDownloadUrl.aspx?urlid=0&id=" + test.group(1)
if len(newDecDownUrl) > 0:
response = requests.get(newDecDownUrl)
html = response.content
datazip = etree.HTML(html).xpath('//*[@id="content"]/table/tr/td/a/@href')[0]
file.write(datazip + " " + "\n")
# time.sleep(2)
# print(newTitle)
# print(newDecUrl)
# print(newDecDownUrl)
# print("——————————————————")
file.write(newTitle + " " + "\n")
file.write(newDecUrl + " " + "\n")
file.write(newDecDownUrl + " " + "\n")
file.write("——————————————————" + "\n")
# print(decref)
page -=1
time.sleep(2)