1. Douban book rankings
import requests
import pandas as pd
from bs4 import BeautifulSoup
import re
def main():
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/101.0.4951.67 Safari/537.36'}
    for i in range(0, 20):  # 20 list pages, 20 books per page
        url = f'https://book.douban.com/tag/%E5%95%86%E4%B8%9A?start={i*20}&type=S'
        response = requests.get(url=url, headers=headers, timeout=30)
        html_content = response.text
        soup = BeautifulSoup(html_content, 'lxml')
        p_html(soup)
        print(f'Page {i+1} scraped')
        print(url)

# One list per output column, filled in page by page.
data_info = {'图书名称': [], '图书作者': [], '图书出版时间': [], '评分': [], '评价人数': [], '简介': []}

def p_html(soup):
    li_list = soup.select('.subject-list li')
    for li in li_list:
        name = re.findall(r'<div class="info">.*?title="(.*?)".*?</a>', str(li), re.S)[0]
        data_info['图书名称'].append(name)
        # The .pub line reads "author / publisher / date / price"
        data_info['图书作者'].append(li.select('.info .pub')[0].text.split('/')[0].strip())
        data_info['图书出版时间'].append(li.select('.info .pub')[0].text.split('/')[-2].strip())
        data_info['评分'].append(float(li.select('.info .star span')[1].text.strip()))
        data_info['评价人数'].append(int(li.select('.info .star span')[2].text.replace('(', '').replace('人评价)', '').strip()))
        intro = li.find('p')  # one-line blurb; occasionally missing
        data_info['简介'].append(intro.text if intro else '')
    return data_info

if __name__ == '__main__':  # entry point
    main()
    book_info = pd.DataFrame(data_info)
    print(book_info.isnull())
    print(book_info.duplicated())
    book_info = book_info.dropna()
    # pandas 1.2+ removed the `encoding` argument from to_excel
    book_info.to_excel('豆瓣图书排名.xlsx', index=False)
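
To sanity-check the selectors in p_html without hitting Douban, you can feed it a minimal hand-written fragment of the markup the script expects. The class names below mirror the real list page, but the book itself is made up; run this inside the script above, since it reuses p_html and data_info:

sample = '''
<ul class="subject-list">
  <li>
    <div class="info">
      <h2><a href="#" title="示例图书">示例图书</a></h2>
      <div class="pub">某作者 / 某出版社 / 2020-1 / 59.00元</div>
      <div class="star clearfix">
        <span class="allstar45"></span>
        <span class="rating_nums">8.9</span>
        <span class="pl">(12345人评价)</span>
      </div>
      <p>一句话简介。</p>
    </div>
  </li>
</ul>
'''
print(p_html(BeautifulSoup(sample, 'lxml')))  # one row: 示例图书 / 某作者 / 2020-1 / 8.9 / 12345 / ...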
2. Boss Zhipin
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.common.exceptions import NoSuchElementException
import pandas as pd
datalist = {'工作名称':[],
'工作地点':[],
'薪资待遇':[],
'经验学历':[],
'技能标签':[],
'公司名称':[],
'公司类型':[],
'福利待遇':[],
'招聘链接':[]}
def get_job_info():
    lis = drive.find_elements(By.CSS_SELECTOR, '.job-list li')
    for li in lis:
        # job link
        link = li.find_element(By.CSS_SELECTOR, '.job-name a').get_attribute('href')
        datalist['招聘链接'].append(link)
        # job title
        name = li.find_element(By.CSS_SELECTOR, '.job-name a').text
        datalist['工作名称'].append(name)
        # location
        area = li.find_element(By.CSS_SELECTOR, '.job-area').text
        datalist['工作地点'].append(area)
        # company name
        company_name = li.find_element(By.CSS_SELECTOR, '.company-text .name a').text
        datalist['公司名称'].append(company_name)
        # company type
        company_type = li.find_element(By.CSS_SELECTOR, '.company-text p a').text
        datalist['公司类型'].append(company_type)
        # salary
        money = li.find_element(By.CSS_SELECTOR, '.red').text
        datalist['薪资待遇'].append(money)
        # experience and education requirements
        exp = li.find_element(By.CSS_SELECTOR, '.job-limit p').text
        datalist['经验学历'].append(exp)
        # skill tags, joined with '/'
        tags = li.find_elements(By.CLASS_NAME, 'tag-item')
        add = []
        for tag in tags:
            if tag.text == '':
                break
            add.append(tag.text)
        datalist['技能标签'].append('/'.join(add))
        # benefits
        boon = li.find_element(By.CSS_SELECTOR, '.info-desc').text
        datalist['福利待遇'].append(boon)
    return datalist
if __name__ == '__main__':
    drive = webdriver.Chrome()
    drive.implicitly_wait(10)
    # e_103..e_106 are Boss Zhipin's experience filters; for each filter, load
    # page 1, then paginate by clicking "next" until the button is disabled
    # (at most 6 pages). The original nesting reloaded page 1 on every pass.
    for i in range(103, 107):
        drive.get(f"https://www.zhipin.com/c100010000/e_{i}/?query=数据分析&page=1&ka=page-")
        for page in range(1, 7):
            print(f'Scraping filter e_{i}, page {page}')
            get_job_info()
            try:
                drive.find_element(By.CSS_SELECTOR, ".page .disabled")
                break  # "next" is disabled: last page reached
            except NoSuchElementException:
                drive.find_element(By.CSS_SELECTOR, ".next").click()
    # pandas 1.2+ removed the `encoding` argument from to_excel
    pd.DataFrame(datalist).to_excel('boss直聘数据分析岗位全国.xlsx', index=False)
    drive.quit()
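
The job list on Boss Zhipin is rendered by JavaScript, so even with the implicit wait the scrape can race the page load. A more robust variant is an explicit wait before each call to get_job_info(); a minimal sketch (wait_for_jobs is a helper name of my own choosing, the selector is the one used above):

from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By

def wait_for_jobs(drive, timeout=10):
    # Block until at least one job card is present, or raise TimeoutException.
    WebDriverWait(drive, timeout).until(
        EC.presence_of_element_located((By.CSS_SELECTOR, '.job-list li')))

Call it right after drive.get(...) and again after every click on ".next".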
3. Data processing
import pandas as pd

boss = pd.read_excel(r"C:\Users\hwt\Desktop\数据分析求职\boss直聘数据分析岗位全国.xlsx")

# Split the "city·district·address" location string into three columns.
city = boss['工作地点'].str.split('·').str[0]
boss.insert(loc=2, column='城市', value=city)
district = boss['工作地点'].str.split('·').str[1]
boss.insert(loc=3, column='区县', value=district)
area = boss['工作地点'].str.split('·').str[2]
boss.insert(loc=4, column='地址', value=area)

# Salary strings look like "15-30K·13薪": keep the lower bound, in yuan.
# expand=False makes extract return a Series (the default one-column DataFrame
# cannot be passed to insert); float stays NaN-safe where the pattern misses.
startsalary = boss['薪资待遇'].str.extract(r'(\d+)-', expand=False).astype(float) * 1000
boss.insert(loc=6, column='起薪', value=startsalary)

# "3-5年本科" -> experience "3-5年" and degree "本科".
exp = boss['经验学历'].str.extract(r'(\d-?\d*年)', expand=False)
boss.insert(loc=8, column='经验', value=exp)
degree = boss['经验学历'].str.extract(r'\d-?\d*[\u4e00-\u9fa5]+([\u4e00-\u9fa5]{2})', expand=False)
boss.insert(loc=9, column='学历', value=degree)

boss = boss.drop_duplicates(['工作名称', '薪资待遇', '公司名称'])
boss.sort_values(by=['公司名称', '经验', '起薪'], ascending=[True, True, True], inplace=True)
# fillna returns a copy; the original discarded the result. Filling after the
# sort also avoids comparing '' against numeric salaries.
boss = boss.fillna('')
# pandas 1.2+ removed the `encoding` argument from to_excel
boss.to_excel('boss直聘数据分析岗位全国cleaning.xlsx', index=False)
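
To see what the three extracts above produce, run them on a couple of representative raw strings (the sample values are invented but follow the site's "low-high K" and "experience + degree" formats):

import pandas as pd

salary = pd.Series(['15-30K·13薪', '10-15K'])
print(salary.str.extract(r'(\d+)-', expand=False).astype(float) * 1000)  # 15000.0, 10000.0

limit = pd.Series(['3-5年本科', '1年以内大专'])
print(limit.str.extract(r'(\d-?\d*年)', expand=False))  # 3-5年, 1年
print(limit.str.extract(r'\d-?\d*[\u4e00-\u9fa5]+([\u4e00-\u9fa5]{2})', expand=False))  # 本科, 大专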
4. Regular expressions (Douban Movie Top250)
# -*- coding: utf-8 -*-
from bs4 import BeautifulSoup  # HTML parsing
import re                      # regular expressions for text matching
import urllib.request          # fetch page data from a URL
import urllib.error            # errors raised while fetching
import openpyxl                # Excel output
import sqlite3                 # SQLite storage (see the sketch at the end)
def main():
    baseurl = 'https://movie.douban.com/top250?start='
    datalist = getData(baseurl)  # scrape and parse the source pages
    saveData(datalist)           # save the data
# movie detail link
findLink = re.compile(r'<a href="(.*?)">')  # compiled pattern object describing the string shape
# poster image
findImgSrc = re.compile(r'<img.*src="(.*?)"', re.S)  # re.S lets '.' match newlines
# title
findTitle = re.compile(r'<span class="title">(.*)</span>')
# rating
findRating = re.compile(r'<span class="rating_num" property="v:average">(.*)</span>')
# number of ratings
findJudge = re.compile(r'<span>(\d*)人评价</span>')
# one-line summary
findInq = re.compile(r'<span class="inq">(.*)</span>')
# other details (director, year, genre, ...)
findBd = re.compile(r'<p class="">(.*?)</p>', re.S)
def getData(baseurl):  # scrape all pages
    datalist = []
    for i in range(0, 10):  # 10 pages of 25 movies each
        url = baseurl + str(i * 25)
        html = askURL(url)  # fetched page source
        # parse this page, one div.item per movie
        soup = BeautifulSoup(html, 'html.parser')
        for item in soup.find_all('div', class_='item'):
            data = []  # all fields for one movie
            item = str(item)
            # detail link, found via the regular expression
            link = re.findall(findLink, item)[0]
            data.append(link)
            imgSrc = re.findall(findImgSrc, item)[0]
            data.append(imgSrc)
            titles = re.findall(findTitle, item)
            if len(titles) == 2:  # Chinese and original title
                ctitle = titles[0]
                data.append(ctitle)
                otitle = titles[1].replace('/', '')
                data.append(otitle)
            else:  # Chinese title only
                data.append(titles[0])
                data.append(' ')
            rating = re.findall(findRating, item)[0]
            data.append(rating)
            judge = re.findall(findJudge, item)[0]
            data.append(judge)
            inq = re.findall(findInq, item)
            if len(inq) != 0:
                data.append(inq[0].replace('。', ''))
            else:
                data.append(' ')
            bd = re.findall(findBd, item)[0]
            bd = re.sub(r'<br(\s+)?/>(\s+)?', ' ', bd)  # strip <br/> tags
            bd = re.sub('/', ' ', bd)
            data.append(bd.strip())  # trim surrounding whitespace
            datalist.append(data)  # collect this movie's record
    return datalist
def askURL(url):  # fetch the page content of a single URL
    head = {
        # A browser User-Agent, so Douban treats the request as a normal browser visit.
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/101.0.4951.67 Safari/537.36'}
    request = urllib.request.Request(url, headers=head)
    html = ''
    try:
        response = urllib.request.urlopen(request)
        html = response.read().decode('utf-8')  # decode the page source as UTF-8
    except urllib.error.URLError as e:
        if hasattr(e, 'code'):
            print(e.code)
        if hasattr(e, 'reason'):
            print(e.reason)
    return html
def saveData(datalist):  # write the data to Excel
    workbook = openpyxl.Workbook()
    worksheet = workbook.active
    worksheet.title = '豆瓣电影Top250'
    col = ('电影详情链接', '图片链接', '影片中文名', '影片外文名', '评分', '评价人数', '概况', '相关信息')
    for i in range(len(col)):
        worksheet.cell(1, i + 1, col[i])  # header row
    for i, data in enumerate(datalist):  # data rows start at row 2
        for j in range(len(col)):
            worksheet.cell(i + 2, j + 1, data[j])
    workbook.save('豆瓣电影Top250数据.xlsx')
if __name__ == '__main__':  # entry point
    main()
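
sqlite3 is imported above but never used. If the movies should also land in a SQLite database, a minimal sketch could sit next to saveData; the function, table, and column names here are my own choice, and the eight fields match the order getData appends them:

def saveData2DB(datalist, dbpath='豆瓣电影Top250.db'):
    conn = sqlite3.connect(dbpath)
    cur = conn.cursor()
    cur.execute('''CREATE TABLE IF NOT EXISTS top250
                   (link TEXT, img TEXT, cname TEXT, oname TEXT,
                    rating TEXT, judge TEXT, inq TEXT, info TEXT)''')
    # One row per movie, fields in the same order as built in getData.
    cur.executemany('INSERT INTO top250 VALUES (?, ?, ?, ?, ?, ?, ?, ?)', datalist)
    conn.commit()
    conn.close()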
5. Dangdang
import requests
import pandas as pd
from bs4 import BeautifulSoup
def main():
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/101.0.4951.67 Safari/537.36'}
    for i in range(1, 26):  # 25 pages of the bestseller list
        url = f'http://bang.dangdang.com/books/bestsellers/01.00.00.00.00.00-recent30-0-0-1-{i}'
        response = requests.get(url=url, headers=headers, timeout=30)
        html_content = response.text
        soup = BeautifulSoup(html_content, 'lxml')
        p_html(soup)
        print(f'Page {i} scraped')

# One list per output column, filled in page by page.
data_info = {'图书排名': [], '图书名称': [], '图书作者': [], '图书出版时间': [], '图书出版社': [], '图书价格': []}

def p_html(soup):
    li_list = soup.select('.bang_list li')
    for li in li_list:
        data_info['图书排名'].append(li.select('.list_num')[0].text.replace('.', ''))
        data_info['图书名称'].append(li.select('.name a')[0].text)
        data_info['图书作者'].append(li.select('.publisher_info')[0].select('a')[0].text)
        data_info['图书出版时间'].append(li.select('.publisher_info span')[0].text)
        data_info['图书出版社'].append(li.select('.publisher_info')[1].select('a')[0].text)
        data_info['图书价格'].append(float(li.select('.price .price_n')[0].text.replace('¥', '')))
    return data_info

if __name__ == '__main__':  # entry point
    main()
    book_info = pd.DataFrame(data_info)
    print(book_info.isnull())
    print(book_info.duplicated())
    book_info = book_info.dropna()
    # pandas 1.2+ removed the `encoding` argument from to_excel
    book_info.to_excel('当当网图书销售排行.xlsx', index=False)
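
Note that both book scripts only print duplicated() without acting on it. If duplicate rows should actually be removed before export, one extra line after dropna() does it (the Douban script in section 1 would take the same line):

book_info = book_info.drop_duplicates()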