目标网站:站长素材
抓取要求:
1.根据输入的页码抓取相应内容
2.根据页码建立文件夹存储本页内容
3.输出控制台正在抓取的内容
使用到的库:
import requests #请求网页
from lxml import html #解析网页源码
import threadpool #多线程
from multiprocessing.pool import Pool #多进程
import os #创建文件夹
完整代码:
import requests #请求网页
from lxml import html #解析网页源码
import threadpool #多线程
from multiprocessing.pool import Pool #多进程
import os #创建文件夹
# Spoof a desktop Chrome User-Agent so the site serves the normal page.
header = {
'User-agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/94.0.4606.81 Safari/537.36',
}
# Put this near the top of the file.
requests.packages.urllib3.disable_warnings() # suppress the InsecureRequestWarning emitted because requests are made with verify=False (SSL certificate checks disabled)
datalist = []  # buffer of {'page', 'hrefs'} dicts filled by one(); note one() runs in multiprocessing workers, so each process sees its own copy
def one(data):
    """Fetch one listing page, create its folder, and download its templates.

    data: dict with
      'url'  -- full listing-page URL to fetch
      'page' -- page number as a string; used as the folder name under datas/.

    Runs inside a multiprocessing worker, so the module-level ``datalist``
    it appends to is local to this process.
    """
    url = data['url']
    page = data['page']
    # makedirs also creates the missing 'datas/' parent on first run; only the
    # "already exists" case is swallowed (the original bare except hid
    # permission errors, missing parents, etc. behind the same message).
    try:
        os.makedirs('datas/%s' % page)
        print('文件夹' + page + '创建成功')
    except FileExistsError:
        print('文件夹已存在')
    res = requests.get(url, headers=header, verify=False).text
    dom = html.etree.HTML(res)
    # Each <a> under #container links to one template's detail page.
    hrefs = dom.xpath('//*[@id="container"]/div/a/@href')
    datalist.append({
        'page': page,
        'hrefs': hrefs
    })
    # Hand the detail pages to two() on a thread pool -- this is I/O-bound
    # work, so threads are appropriate. concurrent.futures (stdlib) replaces
    # the unmaintained third-party `threadpool` package; the context manager
    # waits for all downloads to finish before returning.
    from concurrent.futures import ThreadPoolExecutor
    with ThreadPoolExecutor(max_workers=10) as executor:
        list(executor.map(two, datalist))
def two(data):
    """Download every template linked from data['hrefs'] into datas/<page>/.

    data: dict with
      'page'  -- folder name under datas/ (created by one())
      'hrefs' -- protocol-relative detail-page URLs scraped by one().
    """
    page = data['page']
    hrefs = data['hrefs']
    for i in hrefs:
        href = 'http:' + i  # hrefs come back protocol-relative ("//...")
        try:
            res = requests.get(href, headers=header, verify=False)
            res.encoding = res.apparent_encoding  # pages are GBK/UTF-8 mixed; let requests guess
            dome = html.etree.HTML(res.text)
            titles = dome.xpath('//*[@class="ppt_left fl"]/div/div/h1/text()')
            down_urls = dome.xpath('//*[@id="down"]/div[2]/ul/li[1]/a/@href')
            # Guard against layout changes / error pages instead of letting
            # [0] raise IndexError and kill the worker thread.
            if not titles or not down_urls:
                continue
            # Strip characters that are illegal in Windows file names; a raw
            # scraped title containing e.g. '/' would break open() below.
            title = ''.join(c for c in titles[0] if c not in '\\/:*?"<>|').strip()
            con = requests.get(down_urls[0], headers=header, verify=False).content
            with open('datas/%s/%s.rar' % (page, title), 'wb') as f:
                f.write(con)
            print('存储完成:%s' % title)
        except requests.RequestException:
            # One failed download should not abort the rest of the page.
            continue
def main():
    """Ask for a page range and crawl each listing page in a process pool."""
    tasks = []  # renamed from `list` -- don't shadow the builtin
    start = int(input('输入开始页:'))
    end = int(input('输入结束页:'))
    base = 'https://aspx.sc.chinaz.com/query.aspx?keyword=%E5%85%8D%E8%B4%B9&issale=&classID=864&'
    for i in range(start, end + 1):
        tasks.append({
            'url': base + 'page=%s' % i,
            # The page number doubles as the folder name; the original
            # recovered it with url.split('=')[-1], which is just str(i).
            'page': str(i),
        })
    # Pool of 5 worker *processes* (the original comment mislabeled them as
    # threads). map() blocks until every page is done; close/join releases
    # the workers even if a page raises.
    po = Pool(5)
    try:
        po.map(one, tasks)
    finally:
        po.close()
        po.join()
if __name__ == '__main__':
    main()
效果: