# Workflow: (1) locate the target tags, (2) extract the data.
# 1. Download the images from the first 5 pages of Qiushibaike's picture section
# Script 1: download every image from the first 5 pages of Qiushibaike's
# picture section into ./day118/01/.
import os

import requests
from urllib import request  # kept for reference; downloads now go through requests
from lxml import etree

headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3521.2 Safari/537.36"
}
url = 'https://www.qiushibaike.com/pic/page/{}/?s=5196770'
save_dir = './day118/01/'
# Create the output directory up front so the first file write cannot fail.
os.makedirs(save_dir, exist_ok=True)
for page in range(1, 6):
    # Page 1 has a different URL shape than pages 2-5.
    if page == 1:
        new_url = 'https://www.qiushibaike.com/pic'
    else:
        new_url = url.format(page)
    page_text = requests.get(url=new_url, headers=headers).text
    tree = etree.HTML(page_text)
    # One <div> per post inside the content column.
    div_list = tree.xpath('//div[@id="content-left"]/div')
    for div in div_list:
        img_name = div.xpath('./div[2]/a/img/@alt')[0]
        # The src attribute is protocol-relative ("//..."); prepend the scheme.
        img_url = 'https:' + div.xpath('./div[2]/a/img/@src')[0]
        img_path = save_dir + img_name + '.jpg'
        # Download with requests so the custom User-Agent header is actually
        # sent (urllib.request.urlretrieve would not use it and the site can
        # reject the default urllib UA). The image is binary, so use .content.
        img_data = requests.get(url=img_url, headers=headers).content
        with open(img_path, 'wb') as f:
            f.write(img_data)
        print(img_name, '下载成功')
# 2. Scrape crawler-related job postings from Boss Zhipin (following each detail page)
# Script 2: scrape crawler-job postings from Boss Zhipin — fetch the listing
# page, then follow each posting's detail page and print title/salary/company.
import requests
from lxml import etree

headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3521.2 Safari/537.36"
}
# Listing page: city c101010100, query "%E7%88%AC%E8%99%AB" (URL-encoded "爬虫"), page 1.
url = 'https://www.zhipin.com/c101010100/?query=%E7%88%AC%E8%99%AB&page=1&ka=page-1'
page_text = requests.get(url=url, headers=headers).text
tree = etree.HTML(page_text)
# One <li> per job posting in the result list.
li_list = tree.xpath('//div[@class="job-list"]/ul/li')
for li in li_list:
    # Detail-page links are site-relative; prepend the site root.
    job_url = 'https://www.zhipin.com' + li.xpath('.//h3/a/@href')[0]
    print(job_url)
    job_text = requests.get(url=job_url, headers=headers).text
    job_tree = etree.HTML(job_text)
    job_name = job_tree.xpath('//div[@class="smallbanner"]/div[1]/div[2]/div[1]/h1/text()')[0]
    job_salary = job_tree.xpath('//div[@class="smallbanner"]/div[1]/div[2]/div[1]/span/text()')[0]
    # Jobs 3 through 7 use a different detail-page layout than the others, so
    # match both layouts with an XPath union ("|") and take the last item of
    # the result list — that is the company name.
    job_addr = job_tree.xpath(
        '//div[@class="detail-content"]/div[5]/div[@class="name"]/text() |//*[@id="main"]/div[3]/div/div[2]/div[2]/div[4]/div[1]/text()')[
        -1]
    # Print each job's title, salary and company name.
    print(job_name, job_salary, job_addr)
# 3. Download the first 10 pages of free résumé templates from chinaz.com (http://sc.chinaz.com/jianli/free.html)
# Script 3: download the free résumé templates from the first 10 pages of
# sc.chinaz.com into ./day118/03/, one .rar archive per template.
import os
import random

import requests
from lxml import etree

headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3521.2 Safari/537.36"
}
url = 'http://sc.chinaz.com/jianli/free_{}.html'
save_dir = './day118/03/'
# Create the output directory up front so the first archive write cannot fail.
os.makedirs(save_dir, exist_ok=True)
for page in range(1, 11):
    # Page 1 has a different URL shape than pages 2-10.
    if page == 1:
        new_url = 'http://sc.chinaz.com/jianli/free.html'
    else:
        new_url = url.format(page)
    response = requests.get(url=new_url, headers=headers)
    # Fix mojibake: force UTF-8 before reading .text.
    response.encoding = 'utf-8'
    page_text = response.text
    tree = etree.HTML(page_text)
    div_list = tree.xpath('//div[@id="container"]/div')
    for div in div_list:
        # Template name. (Alternative mojibake fix, if needed:
        # res_name = res_name.encode('iso-8859-1').decode('gbk'))
        res_name = div.xpath('./a/img/@alt')[0]
        # Local path the archive will be saved to.
        res_path = save_dir + res_name + '.rar'
        # URL of the template's own detail page.
        res_url = div.xpath('./a/@href')[0]
        res_text = requests.get(url=res_url, headers=headers).text
        res_tree = etree.HTML(res_text)
        # List of mirror download links (e.g. Fujian Telecom, Xiamen Telecom).
        down_list = res_tree.xpath('//div[@id="down"]/div[2]//a/@href')
        # Pick one mirror at random to spread the load.
        down_url = random.choice(down_list)
        # The archive is binary, so read .content — never .text here.
        down_res = requests.get(url=down_url, headers=headers).content
        print(res_path)
        # Binary payload: open in 'wb' and pass no encoding.
        with open(res_path, 'wb') as f:
            f.write(down_res)
        print(res_name, '下载完毕')