I. Beautiful Soup
1. A fairly comprehensive introduction:
https://cuiqingcai.com/1319.html
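Since the tutorial above covers the details, here is just a minimal sketch of the library in action, parsing a fragment and querying a tag (install with pip install beautifulsoup4):
from bs4 import BeautifulSoup

html = '<html><body><p class="title">Hello</p></body></html>'
soup = BeautifulSoup(html, 'html.parser')
print(soup.p.get_text())               # Hello
print(soup.find('p', class_='title'))  # the whole <p> element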
2. Crawler examples
QQ Music download (if anything here infringes rights or is otherwise improper, please contact me and I will take it down)
https://y.qq.com/
Overall approach: open any song's playback page, press Ctrl + Shift + I to open DevTools, and go to Network → Media; there you can see the resource path of the file being played along with its parameters. That URL serves the binary stream of the audio file, so fetching it programmatically and writing it to disk is all the download takes.
To build the download link, the parameters you need are vkey and guid, found under Headers → Query String Parameters.
For example: http://111.202.85.147/amobile.music.tc.qq.com/C400003eum8Z3yRRXm.m4a?guid=8372245812&vkey=6096760234AFB3ECE108D2EDF68B829EA5B4EF4B6E3557F8F9AEE63E2239E7960BC13B4A6CDAA712D78BBCBD25C05FD4DDF4BE4ECBDEABA5&uin=0&fromtag=66
Breaking that link into its parts:
http://111.202.85.147/amobile.music.tc.qq.com/
C400003eum8Z3yRRXm.m4a
?
guid=8372245812
&
vkey=6096760234AFB3ECE108D2EDF68B829EA5B4EF4B6E3557F8F9AEE63E2239E7960BC13B4A6CDAA712D78BBCBD25C05FD4DDF4BE4ECBDEABA5
&
uin=0
&
fromtag=66
The parts that change per song are the track ID (C400003eum8Z3yRRXm.m4a) and the vkey value; everything else is fixed.
Both values can be dug out of the responses under Network → JS, specifically the fcg files. With some searching you can work out the pattern, assemble the URL, request it, and save the returned data.
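As a minimal sketch of that final save step (assuming a complete, unexpired URL like the example above; the function name and the output filename song.m4a are arbitrary):
import requests

def download_track(song_url, out_path):
    # The response body is the raw m4a byte stream, so write it in binary mode
    resp = requests.get(song_url)
    resp.raise_for_status()
    with open(out_path, 'wb') as f:
        f.write(resp.content)

# e.g. download_track('http://111.202.85.147/amobile.music.tc.qq.com/C400003eum8Z3yRRXm.m4a?guid=8372245812&vkey=<fresh vkey>&uin=0&fromtag=66', 'song.m4a')
The script below walks the playlist API to collect the playlist IDs (dissid) needed to reach those per-song parameters.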
import requests
import json

# Entry URL (pagination ignored for now)
start_url = 'https://c.y.qq.com/splcloud/fcgi-bin/fcg_get_diss_by_tag.fcg?picmid=1&rnd=0.5763118180791627&g_tk=1806862358&jsonpCallback=a&loginUin=1009137312&hostUin=0&format=jsonp&inCharset=utf8&outCharset=utf-8&notice=0&platform=yqq&needNewCode=0&categoryId=10000000&sortId=5&sin=0&ein=29'
# Disguise the request as a normal browser visit
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36',
    'Referer': 'https://y.qq.com/portal/playlist.html',
    'Host': 'y.qq.com',
    # 'Cookie': ''
}
html = requests.get(start_url, headers=headers).text
# Strip the JSONP wrapper a(...) before parsing
json_dict = json.loads(html.strip('a()'))
# Walk the response and collect every playlist's dissid
dissid_list = []
for item in json_dict['data']['list']:
    dissid_list.append(item['dissid'])
# For each dissid, build the URL that returns that playlist's songmid data
get_songmid_url_list = []
for dissid in dissid_list:
    url = 'https://c.y.qq.com/qzone/fcg-bin/fcg_ucc_getcdinfo_byids_cp.fcg?type=1&json=1&utf8=1&onlysong=0&disstid={0}&format=jsonp&g_tk=1806862358&jsonpCallback=playlistinfoCallback&hostUin=0&format=jsonp&inCharset=utf8&outCharset=utf-8&notice=0&platform=yqq&needNewCode=0'.format(dissid)
    get_songmid_url_list.append(url)
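The script stops once it has the per-playlist URLs. As a hedged continuation, each of those fcg responses is JSONP wrapped in playlistinfoCallback(...); a sketch for extracting the songmid values might look like this (the cdlist/songlist/songmid field names are assumptions to verify against the live response in DevTools):
import requests
import json

songmid_list = []
for url in get_songmid_url_list:
    text = requests.get(url, headers=headers).text
    # Drop the playlistinfoCallback( ... ) wrapper, then parse the JSON inside
    payload = json.loads(text[len('playlistinfoCallback('):-1])
    # NOTE: assumed field names -- confirm them against the real payload
    for cd in payload.get('cdlist', []):
        for song in cd.get('songlist', []):
            songmid_list.append(song.get('songmid'))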
Zhilian Zhaopin job site (if anything here infringes rights or is otherwise improper, please contact me and I will take it down)
# -*- coding: utf-8 -*-
from lxml import etree
import requests
import re
import time
'''
Requirements analysis
    1. Enter the job search at https://sou.zhaopin.com/ and fetch the job-category list
    2. Follow each category into its detailed job listings
    3. Open each job's detail page and extract the full job information
Source analysis
Implementation
'''
# 1. Fetch the job-category list URLs
def get_job_cat_list(url, headers):
    r = requests.get(url, headers=headers).text
    # Parse the page
    index = etree.HTML(r)
    # Extract the category URLs
    job_url = index.xpath('//div[@id="search_right_demo"]/div/div/a/@href')
    # Rewrite the city parameter to jl=489
    pattern = re.compile(r'jl=\d+&')
    new_job_url = [url[:-1] + pattern.sub('jl=489&', i) for i in job_url]
    return new_job_url
# 2. Fetch the job list for one category
def get_job_list(url, headers):
    # Send the request and parse the listing page
    result = requests.get(url, headers=headers).text
    job_list = etree.HTML(result)
    # URLs of the individual job postings
    job_url = job_list.xpath('//div[@id="newlist_list_content_table"]/table/tr[1]/td[1]/div/a[1]/@href')
    # Link to the next page of results
    next_page = job_list.xpath('//a[@class="next-page"]/@href')
    return job_url, next_page
# 3. Fetch the detailed information for one job
def get_job_info(url):
    r = requests.get(url).text
    info = etree.HTML(r)
    dic = {}
    # Job title (zwmc); the heading div carries class "inner-left fl" or plain "fl"
    dic['zwmc'] = info.xpath('string(//div[@class="inner-left fl" or @class="fl"]/h1)')
    # Company name (gsmc)
    dic['gsmc'] = info.xpath('string(//div[@class="inner-left fl" or @class="fl"]/h2)')
    # Company benefits (gsfl)
    dic['gsfl'] = info.xpath('//div[@class="welfare-tab-box"]/span/text()')
    # Monthly salary (zwyx)
    dic['zwyx'] = info.xpath('string(//div[@class="terminalpage-left"]/ul/li[1]/strong)')
    # Location (gzdd), publish date (fbrq), job type (gzxz), experience (gzjy),
    # minimum education (zdxl), headcount (zprs), job category (zwlb)
    dic['gzdd'] = info.xpath('string(//div[@class="terminalpage-left"]/ul/li[2]/strong)')
    dic['fbrq'] = info.xpath('string(//div[@class="terminalpage-left"]/ul/li[3]/strong)')
    dic['gzxz'] = info.xpath('string(//div[@class="terminalpage-left"]/ul/li[4]/strong)')
    dic['gzjy'] = info.xpath('string(//div[@class="terminalpage-left"]/ul/li[5]/strong)')
    dic['zdxl'] = info.xpath('string(//div[@class="terminalpage-left"]/ul/li[6]/strong)')
    dic['zprs'] = info.xpath('string(//div[@class="terminalpage-left"]/ul/li[7]/strong)')
    dic['zwlb'] = info.xpath('string(//div[@class="terminalpage-left"]/ul/li[8]/strong)')
    jobs_info = clear_none(dic)
    if jobs_info:
        jobs_infos = clear_data(jobs_info)
        save_data(jobs_infos)
# Filter: drop records with an empty or duplicate job title
zw_lis = []
def clear_none(data):
    if data['zwmc'] == '' or data['zwmc'] in zw_lis:
        return False
    else:
        zw_lis.append(data['zwmc'])
        return data
# Clean the extracted fields
def clear_data(data):
    # gsfl: join the benefits list into one underscore-separated string
    data['gsfl'] = '_'.join([str(i) for i in data['gsfl']])
    # zwyx: split the salary range into min_zwyx / max_zwyx
    patten = re.compile(r'\d+')
    zwyxlis = patten.findall(data['zwyx'])
    if len(zwyxlis) == 2:
        data['min_zwyx'] = zwyxlis[0]
        data['max_zwyx'] = zwyxlis[1]
    elif zwyxlis:
        data['min_zwyx'] = data['max_zwyx'] = zwyxlis[0]
    else:
        # e.g. "面议" (negotiable): no digits to extract
        data['min_zwyx'] = data['max_zwyx'] = 0
    # Drop the raw salary string in every branch so the columns stay aligned
    data.pop('zwyx')
    # gzdd: keep only the city part
    data['gzdd'] = data['gzdd'].split('-')[0]
    # zprs: strip the trailing "人" (people) suffix
    data['zprs'] = data['zprs'].strip('人 ')
    # fbrq: keep only the date part of the timestamp
    time_tup = time.strptime(data['fbrq'], '%Y-%m-%d %H:%M:%S')
    data['fbrq'] = time.strftime('%Y-%m-%d', time_tup)
    return data
# 4. Append one record to the CSV file
def save_data(data):
    datas = ','.join([str(i) for i in data.values()])
    print(datas)
    with open('zlzp.csv', 'a+', encoding='utf-8') as file:
        file.write(datas + '\n')
if __name__ == '__main__':
    # Entry URL
    url = 'http://sou.zhaopin.com/'
    headers = {
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.113 Safari/537.36',
        # 'Referer': 'https://www.zhaopin.com/',
    }
    # Header row; matches the dict order after clear_data (zwyx becomes min/max at the end)
    with open('zlzp.csv', 'w', encoding='utf-8') as file:
        file.write('zwmc,gsmc,gsfl,gzdd,fbrq,gzxz,gzjy,zdxl,zprs,zwlb,min_zwyx,max_zwyx\n')
    # a. get_job_cat_list: fetch the categories
    job_cat_list = get_job_cat_list(url, headers)
    # b. get_job_list: fetch the job list (only the first category for now)
    # for x in job_cat_list:
    job_list, next_page = get_job_list(job_cat_list[0], headers=headers)
    # c. get_job_info: fetch and save the detailed info for each job
    for i in job_list:
        get_job_info(i)
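One loose end: the main block collects next_page but never follows it. A hedged sketch of the pagination loop, assuming next_page holds a fully qualified URL (adjust the join if the site returns a relative path):
# Sketch: replace steps b/c above with a loop that follows the pager
job_list, next_page = get_job_list(job_cat_list[0], headers=headers)
while True:
    for i in job_list:
        get_job_info(i)
    if not next_page:
        break  # no further pages
    job_list, next_page = get_job_list(next_page[0], headers=headers)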
Blog crawl (cnblogs)
# -*- coding: utf-8 -*-
from lxml import etree
import requests
'''
Requirements analysis
    Crawl the posts on cnblogs
Source analysis
    https://www.cnblogs.com/
    post block   = post_item_body
    post title   = cb_post_title_url
    post content = cnblogs_post_body
Implementation
    1. Request the page source from the entry URL
    2. Extract the URL of each post
    3. Follow each post URL to its detail page and grab the content
    4. Save the data
'''
# 1. Request the page source from the entry URL
url = 'https://www.cnblogs.com/'
now_url = url
headers = {
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.113 Safari/537.36'
}
num = 1
page = 1
while True:
    r = requests.get(now_url, headers=headers).text
    # Parse the listing page
    index = etree.HTML(r)
    # 2. Extract the URL of each post, plus the last link in the pager
    tz_url = index.xpath('//div[@class="post_item_body"]/h3/a/@href')
    next_url = index.xpath('//div[@class="pager"]/a[last()]')
    # 3. Follow each post URL to its detail page and grab the content
    for i in tz_url:
        detail = requests.get(i).text
        html = etree.HTML(detail)
        # Title comes back as a list, content as a string
        tz_title = html.xpath('//a[@id="cb_post_title_url"]/text()')
        tz_content = html.xpath('string(//*[@id="cnblogs_post_body"])')
        if not tz_title:
            continue  # skip pages without the expected title anchor
        # 4. Save the data
        with open('cn-blogs.csv', 'a+', encoding='utf-8') as file:
            file.write(tz_title[0] + '\n')
            file.write(tz_content + '\n')
            file.write(i + '\n')
            file.write('*' * 50 + '\n')
        print('page {0}, post {1}'.format(page, num))
        num += 1
    # Keep paging while the last pager link reads "Next >"
    if next_url and next_url[0].xpath('text()')[0] == 'Next >':
        now_url = url[:-1] + next_url[0].xpath('@href')[0]
        print(now_url)
        page += 1
        num = 1
    else:
        break