Datasets:
https://archive.ics.uci.edu/ml/index.php
https://www.kaggle.com/datasets
**Python crawling skills**:
- Scraping static pages (urllib/requests/BeautifulSoup/lxml)
- Scraping dynamic pages (Ajax/PhantomJS/Selenium; see the sketch below)
- Crawler frameworks (Scrapy)
- Supporting knowledge: front-end basics, databases, text processing
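Dynamic pages render their content with JavaScript, so a plain HTTP request often returns an empty shell; a browser-driving tool such as Selenium executes the page first. A minimal sketch, assuming Chrome and the selenium package are installed (the CSS selector is illustrative, not tied to any page in these notes):

from selenium import webdriver
from selenium.webdriver.common.by import By

# launch a real browser so JavaScript runs before we read the DOM
driver = webdriver.Chrome()
driver.get('https://www.python.org/')
# once rendered, locate elements much like with a static parser
for a in driver.find_elements(By.CSS_SELECTOR, '.event-widget li a'):  # illustrative selector
    print(a.text, a.get_attribute('href'))
driver.quit()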
**The four basic steps of a Python crawler**
- Request: urllib/requests
- Parse: BeautifulSoup/lxml
- Extract: CSS selectors / XPath expressions / regular expressions
- Store: csv/MySQL/MongoDB, etc.
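A minimal sketch chaining the four steps, assuming the lxml parser is installed (the target URL and selector are only illustrative; swap csv for MySQL/MongoDB in step 4 as needed):

import csv
import requests
from bs4 import BeautifulSoup

res = requests.get('https://www.python.org/')            # 1. request
soup = BeautifulSoup(res.text, 'lxml')                   # 2. parse
rows = [(a.get_text(strip=True), a.get('href'))          # 3. extract (CSS selector)
        for a in soup.select('a[href]')]
with open('links.csv', 'w', newline='', encoding='utf-8') as f:
    writer = csv.writer(f)                               # 4. store as csv
    writer.writerow(['title', 'link'])
    writer.writerows(rows)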
### urllib: Python's standard library, providing a set of functions for working with URLs
**Requesting a page directly with urllib**
from urllib.request import urlopen
url = "https://www.python.org/"
response = urlopen(url)
content = response.read()
# the raw bytes need decoding
content = content.decode('utf-8')
print(content)
# Calling urlopen on a bare URL is rather blunt; sometimes we want to build a Request object first
import urllib.request
url = "https://www.python.org/"
request = urllib.request.Request(url)
response = urllib.request.urlopen(request)
content = response.read().decode('utf-8')
#print(content)
print(response.geturl())
print(response.info())
### Print the response status code
print(response.getcode())
print(type(response))
### The requests library
import requests
res = requests.get('https://www.python.org/')
print(res.status_code)
print(res.text)
#print(res.content)  # res.content returns raw bytes while res.text returns a decoded string; use text when you want plain text
#%% md
#### Setting request headers
#%%
url = 'https://www.python.org/'
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36'
}
res = requests.get(url, headers=headers)
print(res)
#%% md
**requests request methods**
- get
- post
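A minimal POST sketch, sent to the public echo service httpbin.org (an illustrative choice; any endpoint that accepts form data behaves the same way):
#%%
import requests

# form data goes in the request body; httpbin echoes it back as json
payload = {'kd': 'machine learning', 'pn': 1}
res = requests.post('https://httpbin.org/post', data=payload)
print(res.status_code)
print(res.json()['form'])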
#%% md
### Parsing library: BeautifulSoup
#%% md
Beautiful Soup is a Python library for extracting data from HTML and XML files. Through your parser of choice it lets you navigate, search, and modify the parse tree in idiomatic ways, saving you hours or even days of work.
#%%
import requests
from bs4 import BeautifulSoup
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.119 Safari/537.36'
}
url = 'http://news.qq.com/'
Soup = BeautifulSoup(requests.get(url=url, headers=headers).text.encode("utf-8"), 'lxml')  # check the encoding declared in the page source first; this page used GB2312
em = Soup.find_all('em', attrs={'class': 'f14 l24'})
for i in em:
    title = i.a.get_text()
    link = i.a['href']
    print({'title': title,
           'link': link})
#%% md
### Parsing library: lxml
#%%
import requests
from lxml import etree
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.119 Safari/537.36'}
url = 'http://news.qq.com/'
html = requests.get(url = url, headers = headers)
con = etree.HTML(html.text)
title = con.xpath('//em[@class="f14 l24"]/a/text()')
link = con.xpath('//em[@class="f14 l24"]/a/@href')
for i in zip(title, link):
    print({'title': i[0],
           'link': i[1]})
#%% md
### Information extraction methods
- CSS selectors: the select method
- XPath expressions
- Regular expressions (see the sketch after the two cells below)
#%%
# select method
import requests
from bs4 import BeautifulSoup
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.119 Safari/537.36'}
url = 'http://news.qq.com/'
Soup = BeautifulSoup(requests.get(url=url, headers=headers).text.encode("utf-8"), 'lxml')
em = Soup.select('em[class="f14 l24"] a')
for i in em:
    title = i.get_text()
    link = i['href']
    print({'title': title,
           'link': link})
#%%
# XPath expressions
import requests
import lxml.html as HTML
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.119 Safari/537.36'}
url = 'http://news.qq.com/'
con = HTML.fromstring(requests.get(url = url, headers = headers).text)
title = con.xpath('//em[@class="f14 l24"]/a/text()')
link = con.xpath('//em[@class="f14 l24"]/a/@href')
for i in zip(title, link):
    print({'title': i[0],
           'link': i[1]})
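#%% md
The list above also names regular expressions; they work on the raw HTML string rather than on a parse tree. A minimal sketch (the pattern mirrors the `em.f14.l24` markup targeted above and is an assumption about the page source):
#%%
# regular expression
import re
import requests
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.119 Safari/537.36'}
url = 'http://news.qq.com/'
html = requests.get(url=url, headers=headers).text
# capture the href and link text of <a> tags inside <em class="f14 l24">
pattern = re.compile(r'<em class="f14 l24"><a[^>]*href="([^"]*)"[^>]*>(.*?)</a>', re.S)
for link, title in pattern.findall(html):
    print({'title': title, 'link': link})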
#%% md
### Static data collection: Lagou
#%%
# import the relevant libraries
import requests
from lxml import etree
import pandas as pd
from time import sleep
import random
# cookie
cookie = 'your cookie'
# headers
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36',
    'Cookie': cookie  # pass the cookie variable, not the literal string
}
#%%
# inspect the page structure, then loop over the pages to collect data
for i in range(1, 6):
    sleep(random.randint(3, 10))
    url = 'https://www.lagou.com/zhaopin/jiqixuexi/{}/?filterOption=3'.format(i)
    print('Crawling page {}...'.format(i), url)
    # request the page and parse it
    con = etree.HTML(requests.get(url=url, headers=headers).text)
    # extract each target field with XPath expressions
    job_name = con.xpath("//a[@class='position_link']/h3/text()")
    job_address = con.xpath("//a[@class='position_link']/span/em/text()")
    job_company = con.xpath("//div[@class='company_name']/a/text()")
    job_salary = con.xpath("//span[@class='money']/text()")
    job_exp_edu = con.xpath("//div[@class='li_b_l']/text()")
    job_exp_edu2 = [s for s in (s.strip() for s in job_exp_edu) if s != '']
    job_industry = con.xpath("//div[@class='industry']/text()")
    job_temptation = con.xpath("//div[@class='list_item_bot']/div[@class='li_b_r']/text()")
    job_links = con.xpath("//div[@class='p_top']/a/@href")
    # follow each detail link and collect the job description
    job_des = []
    for link in job_links:
        sleep(random.randint(3, 10))
        con2 = etree.HTML(requests.get(url=link, headers=headers).text)
        des = [[p.xpath('string(.)') for p in con2.xpath("//dd[@class='job_bt']/div/p")]]
        job_des += des
# wrap the fields in a dict
# (each pass of the loop overwrites the field lists, so only the last page
# ends up in the csv; the function version below accumulates every page)
dataset = {
    'job_name': job_name,
    'address': job_address,
    'company': job_company,
    'salary': job_salary,
    'exp_edu': job_exp_edu2,
    'industry': job_industry,
    'benefits': job_temptation,
    'job_description': job_des
}
# convert to a DataFrame and save as csv
data = pd.DataFrame(dataset)
data.to_csv('machine_learning_hz_job2.csv')
#%%
data.head()
#%%
# wrapped as a function
import requests
from lxml import etree
import pandas as pd
from time import sleep
import random

def static_crawl():
    cookie = 'your cookie'
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36',
        'Cookie': cookie  # pass the cookie variable, not the literal string
    }
    # collect one DataFrame per page, then concatenate at the end
    pages = []
    for i in range(1, 7):
        sleep(random.randint(3, 10))
        url = 'https://www.lagou.com/zhaopin/jiqixuexi/{}/?filterOption=3'.format(i)
        print('Crawling page {}...'.format(i), url)
        con = etree.HTML(requests.get(url=url, headers=headers).text)
        job_name = con.xpath("//a[@class='position_link']/h3/text()")
        job_address = con.xpath("//a[@class='position_link']/span/em/text()")
        job_company = con.xpath("//div[@class='company_name']/a/text()")
        job_salary = con.xpath("//span[@class='money']/text()")
        job_exp_edu = con.xpath("//div[@class='li_b_l']/text()")
        job_exp_edu2 = [s for s in (s.strip() for s in job_exp_edu) if s != '']
        job_industry = con.xpath("//div[@class='industry']/text()")
        job_temptation = con.xpath("//div[@class='list_item_bot']/div[@class='li_b_r']/text()")
        job_links = con.xpath("//div[@class='p_top']/a/@href")
        job_des = []
        for link in job_links:
            sleep(random.randint(3, 10))
            con2 = etree.HTML(requests.get(url=link, headers=headers).text)
            des = [[p.xpath('string(.)') for p in con2.xpath("//dd[@class='job_bt']/div/p")]]
            job_des += des
        lagou_dict = {
            'job_name': job_name,
            'address': job_address,
            'company': job_company,
            'salary': job_salary,
            'exp_edu': job_exp_edu2,
            'industry': job_industry,
            'benefits': job_temptation,
            'job_description': job_des
        }
        pages.append(pd.DataFrame(lagou_dict))
    # concatenate all pages, save as csv, and return
    crawl_data = pd.concat(pages, ignore_index=True)
    crawl_data.to_csv('machine_learning_hz_job2.csv')
    return crawl_data
#%% md
### Dynamic data collection: Lagou
#%%
import json
import time
import requests
from bs4 import BeautifulSoup
import pandas as pd

# main crawling function
def lagou_dynamic_crawl():
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36',
        'Host': 'www.lagou.com',
        'Referer': 'https://www.lagou.com/jobs/list_%E6%9C%BA%E5%99%A8%E5%AD%A6%E4%B9%A0?px=default&city=%E5%85%A8%E5%9B%BD',
        'X-Anit-Forge-Code': '0',
        'X-Anit-Forge-Token': None,
        'X-Requested-With': 'XMLHttpRequest',
        'Cookie': 'your cookie'
    }
    # container for the collected positions
    positions = []
    # loop over 30 result pages
    for page in range(1, 31):
        print('Crawling page {}...'.format(page))
        # build the request form parameters ('kd' is the search keyword, here "数据挖掘"/data mining)
        params = {
            'first': 'true',
            'pn': page,
            'kd': '数据挖掘'
        }
        # send the request and fetch the result
        result = requests.post('https://www.lagou.com/jobs/positionAjax.json?px=default&needAddtionalResult=false',
                               headers=headers, data=params)
        # convert the response to json
        json_result = result.json()
        # walk the json structure to reach the target records
        position_info = json_result['content']['positionResult']['result']
        # loop over the positions on the current page, then crawl each detail page
        for position in position_info:
            # put the fields we want into a dict
            position_dict = {
                'position_name': position['positionName'],
                'work_year': position['workYear'],
                'education': position['education'],
                'salary': position['salary'],
                'city': position['city'],
                'company_name': position['companyFullName'],
                'address': position['businessZones'],
                'label': position['companyLabelList'],
                'stage': position['financeStage'],
                'size': position['companySize'],
                'advantage': position['positionAdvantage'],
                'industry': position['industryField'],
                'industryLables': position['industryLables']
            }
            # grab the position ID
            position_id = position['positionId']
            # fetch the job description (JD) for this ID
            position_dict['position_detail'] = recruit_detail(position_id)
            positions.append(position_dict)
            time.sleep(4)
    print('All data collected.')
    return positions

# function to fetch the job description of one position
def recruit_detail(position_id):
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36',
        'Host': 'www.lagou.com',
        'Referer': 'https://www.lagou.com/jobs/list_%E6%9C%BA%E5%99%A8%E5%AD%A6%E4%B9%A0?labelWords=&fromSearch=true&suginput=',
        'Upgrade-Insecure-Requests': '1',
        'Cookie': 'your cookie'
    }
    url = 'https://www.lagou.com/jobs/%s.html' % position_id
    result = requests.get(url, headers=headers)
    time.sleep(5)
    # parse the job description text
    soup = BeautifulSoup(result.text, 'html.parser')
    job_jd = soup.find(class_="job_bt")
    # some records turned out to have an empty description,
    # so guard against that here
    if job_jd is not None:
        job_jd = job_jd.text
    else:
        job_jd = 'null'
    return job_jd

if __name__ == '__main__':
    positions = lagou_dynamic_crawl()
#%%
positions
#%%
df = pd.DataFrame(positions)
df.shape
#%%
df.head()
#%%
df.to_csv('data_mining_hz.csv')
#%%
###### Crawl the Douban top-250 book rankings
import requests
import json
class DoubanBook_Spider():
    def __init__(self):
        # 'start={}' is filled in page by page in run()
        self.url_temp_list = [
            {"url_temp": "https://m.douban.com/rexxar/api/v2/subject_collection/book_fiction/items?start={}&count=18&loc_id=0",
             "book": "fiction"},
            {"url_temp": "https://m.douban.com/rexxar/api/v2/subject_collection/book_nonfiction/items?start={}&count=18&loc_id=0",
             "book": "nonfiction"},
            {"url_temp": "https://m.douban.com/rexxar/api/v2/subject_collection/book_classic/items?start={}&count=18&loc_id=0",
             "book": "classic"}
        ]
        self.headers = {
            "Referer": "https://m.douban.com/book/classic",
            "User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.130 Safari/537.36"
        }

    def parse_url(self, url):  # send the request, get the response
        print(url)
        response = requests.get(url, headers=self.headers)
        return response.content.decode()

    def get_content_list(self, json_str):  # extract the data
        dict_ret = json.loads(json_str)
        content_list = dict_ret["subject_collection_items"]
        total = dict_ret["total"]
        return content_list, total

    def save_content_list(self, content_list, book):
        with open("book_list.txt", "a", encoding="utf-8") as f:
            for content in content_list:
                content["book"] = book
                f.write(json.dumps(content, ensure_ascii=False))
                f.write("\n")  # newline after each record
        print("Saved successfully")

    def run(self):
        for url_temp in self.url_temp_list:
            num = 0
            total = 100  # placeholder; updated after the first response
            while num < total + 18:
                # 1. build the start URL
                url = url_temp["url_temp"].format(num)
                # 2. send the request, get the response
                json_str = self.parse_url(url)
                # 3. extract the data
                content_list, total = self.get_content_list(json_str)
                # 4. save
                self.save_content_list(content_list, url_temp["book"])
                # if len(content_list) < 18:
                #     break
                # 5. build the next page's url and loop
                num += 18

if __name__ == "__main__":
    douban_spider = DoubanBook_Spider()
    douban_spider.run()