爬虫流程整理
1.获取网络数据 - requests / selenium
找到目标网站 - 直接用 requests,设置请求头 - user-agent - cookie - 设置代理
请求被拒绝时 - 改用 selenium - cookie - 设置代理
2.解析数据
正则表达式、css选择器(bs4、pyquery)、xpath
3.保存数据
csv、excel
"""
import requests

# Smoke test: fetch the Pinduoduo homepage and report the HTTP status code.
homepage = 'https://www.pinduoduo.com/'
resp = requests.get(homepage)
print(resp.status_code)
excel文件写操作
import openpyxl

# Open an existing workbook, list its sheet names, clear cell B1 on the
# 'Student' sheet, and save the workbook back to disk.
book = openpyxl.load_workbook('files/test2.xlsx')
print(book.sheetnames)
student_sheet = book['Student']
# Accessing a cell via .cell() materializes it in the sheet (openpyxl behavior).
first_cell = student_sheet.cell(1, 1)
b1_cell = student_sheet['B1']
b1_cell.value = ''
book.save('./files/test2.xlsx')
excel文件读操作
import openpyxl

# Walk the active sheet of test1.xlsx by rows and by columns.
wb = openpyxl.load_workbook('files/test1.xlsx')
sheet = wb.active

# All cells of the 4x4 top-left region, row by row.
print(list(sheet.iter_rows(1, 4, 1, 4)))
# Just row 4 (all columns).
print(list(sheet.iter_rows(4, 4)))
# Rows 2-4, columns 1-2.
print(list(sheet.iter_rows(2, 4, 1, 2)))
# Same 4x4 region, column by column.
print(list(sheet.iter_cols(1, 4, 1, 4)))
# Column 4, rows 2-4 — print each cell's value.
all_scores = sheet.iter_cols(4, 4, 2, 4)
for score_cell in next(all_scores):
    print(score_cell.value)
51job数据分析岗位爬取
import requests
from re import findall
from json import loads
import time
import os
import openpyxl
# Desktop-browser User-Agent so 51job serves the normal search page instead of
# rejecting the request as an obvious bot. Shared by get_one_page().
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.212 Safari/537.36 Edg/90.0.818.66'
}
def get_one_page(page):
    """Fetch one page of 51job search results for "数据分析".

    Returns the list of job dicts embedded in the page's
    ``window.__SEARCH_RESULT__`` JSON, or None when the request fails
    or the payload cannot be found.
    """
    # NOTE(review): the original URL contained '°reefrom' — the '&deg' of
    # '&degreefrom' had been decoded into the degree sign; restored here.
    url = (f'https://search.51job.com/list/000000,000000,0000,00,9,99,'
           f'数据分析,2,{page}.html?lang=c&postchannel=0000&workyear=99'
           f'&cotype=99&degreefrom=99&jobterm=99&companysize=99'
           f'&ord_field=0&dibiaoid=0&line=&welfare=')
    response = requests.get(url, headers=headers)
    if response.status_code != 200:
        print('请求失败!')
        return None
    # The result set is embedded in a <script> tag as a JSON object literal.
    matches = findall(r'window.__SEARCH_RESULT__\s*=\s*(\{.+?\})</script>', response.text)
    if not matches:
        # Page layout changed or we were served an anti-bot page.
        return None
    return loads(matches[0]).get('engine_search_result')
def get_all_data():
    """Fetch and save result pages one by one until a page comes back empty.

    Calls get_one_page() for pages 1, 2, 3, ... and hands each non-empty
    result to save_page_data(); stops at the first empty/missing page.
    """
    page = 1
    while True:
        result = get_one_page(page)
        if not result:
            print('没有更多数据')
            break
        save_page_data(result)
        # Report the page that was just saved (the original incremented
        # `page` first, so the message was off by one).
        print(f'获取第{page}页数据成功!')
        page += 1
        time.sleep(1)  # throttle: be polite to the server between requests
def get_work_book():
    """Open (or create) files/招聘信息.xlsx and return (workbook, sheet).

    Ensures a '数据分析' worksheet exists. The header row is written only
    when the sheet is first created, so repeated runs append below the
    existing rows instead of rewriting row 1 every time.
    """
    if os.path.exists('files/招聘信息.xlsx'):
        wb = openpyxl.load_workbook('files/招聘信息.xlsx')
    else:
        wb = openpyxl.Workbook()
    if '数据分析' in wb.sheetnames:
        sheet = wb['数据分析']
    else:
        sheet = wb.create_sheet('数据分析')
        titles = ['岗位名称', '薪资', '公司名称', '公司性质', '公司地址', '要求', '福利']
        for col, title in enumerate(titles, start=1):
            sheet.cell(1, col).value = title
    return wb, sheet
def save_page_data(data: list):
    """Append one page of job dicts to the shared worksheet and save.

    Relies on module-level ``wb`` and ``sheet`` assigned in __main__
    by get_work_book().
    """
    # First empty row: append below whatever is already in the sheet.
    row = sheet.max_row + 1
    for job in data:
        job_info = [
            job.get('job_name', ''),
            job.get('providesalary_text', ''),
            job.get('company_name', ''),
            job.get('companytype_text', ''),
            job.get('workarea_text', ''),
            # attribute_text is a list of requirement strings; joined with '/'.
            '/'.join(job.get('attribute_text', ['-', '-', '-', '-', '-'])),
            job.get('jobwelf', ''),
        ]
        for col, value in enumerate(job_info, start=1):
            sheet.cell(row, col).value = value
        row += 1
    # Save after every page so an interrupted run keeps the data fetched so far.
    wb.save('files/招聘信息.xlsx')
if __name__ == '__main__':
    # wb/sheet are bound at module level so save_page_data() can reach them.
    wb, sheet = get_work_book()
    get_all_data()
selenium 设置选项
from selenium import webdriver

# Launch Chrome without the "controlled by automated software" banner and
# with image loading disabled, then dump JD's homepage HTML.
jd_url = 'https://www.jd.com'
chrome_opts = webdriver.ChromeOptions()
# Hide the automation infobar.
chrome_opts.add_experimental_option('excludeSwitches', ['enable-automation'])
# 2 = block images, which speeds up page loads for scraping.
chrome_opts.add_experimental_option("prefs", {"profile.managed_default_content_settings.images": 2})
driver = webdriver.Chrome(options=chrome_opts)
driver.get(jd_url)
print(driver.page_source)
frame 切换
from selenium import webdriver

# Demo: switching the driver into an <iframe> before reading its content.
url = 'https://mail.163.com/'
b = webdriver.Chrome()
b.get(url)
"""
有的时候会遇到这样的网页:一个网页对应的html标签嵌套了其他的html标签
(前端如果要实现嵌套的功能必须要将被嵌套的html放在iframe标签中),
如果需要爬取网页内容在嵌套的html里面,需要先让浏览器选中内容嵌套的html。
(浏览器对象默认选中的是最外面的html标签)
"""
# The 163 login form lives inside an iframe; locate the frame element and
# switch the driver's context into it, otherwise its HTML is invisible.
box = b.find_element_by_css_selector('#loginDiv>iframe')
b.switch_to.frame(box)
print(b.page_source)
51job
from selenium import webdriver
from selenium.webdriver.common.keys import Keys

# Drive 51job's UI: search for "数据分析", then pick Beijing in the city picker.
url = 'https://www.51job.com/'
b = webdriver.Chrome()
b.get(url)
# The local variable was named `input` in the original, shadowing the builtin;
# renamed to keyword_box (behavior unchanged).
keyword_box = b.find_element_by_css_selector('#kwdselectid')
keyword_box.send_keys('数据分析')
keyword_box.send_keys(Keys.ENTER)
# Open the "all cities" picker.
allcity = b.find_element_by_css_selector('.allcity')
allcity.click()
print(b.page_source)
# First city cell in the picker table (Beijing), then confirm the dialog.
beijing = b.find_element_by_css_selector('.panel_lnp.panel_py.panel_ct2 .de.d3 tbody tr td em')
print(beijing.text)
beijing.click()
sure = b.find_element_by_css_selector('#popop > div > div.but_box > span')
sure.click()
线程与进程
"""
Time:2021/6/1 17:03
Author:Spectre
"""
"""
如果CPU调度(切换)足够快,就造成了多线程并发执行的假象
线程很多的情况下,会消耗大量CPU资源
"""
import time
from datetime import datetime
from threading import Thread
def download(name):
    """Simulate downloading *name*: log start/end timestamps around a 2s pause."""
    print(f'{name}开始下载:{datetime.now()}')
    time.sleep(2)  # stand-in for real download work
    print(f'{name}结束下载:{datetime.now()}')
# Sequential: the three simulated downloads run one after another (~6s total).
download('肖申克的救赎')
download('霸王别姬')
download('阿甘正传')
# Threaded: the same three calls overlap, finishing in ~2s — the timestamps
# printed by download() show the difference.
t1 = Thread(target=download,args=('肖申克的救赎',))
t2 = Thread(target=download,args=('霸王别姬',))
t3 = Thread(target=download,args=('阿甘正传',))
t1.start()
t2.start()
t3.start()
homework
"""
Time:2021/6/1 19:12
Author:Spectre
"""
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
import time
from bs4 import BeautifulSoup
import csv
import time
from datetime import datetime
from threading import Thread
def get_net_data():
    """Unfinished stub — declares the shared browser and does nothing else."""
    # NOTE(review): the fetch logic presumably meant to live here was written
    # inline in the __main__ block instead; this function is never called.
    global b
def an_data(data):
    """Parse one 51job search-results page (HTML string) into job dicts.

    Returns a list of dicts with keys: name, link, time, price, info,
    tags, co_name, co_link, co_type, co_intro.
    """
    soup = BeautifulSoup(data, 'lxml')
    all_item = []
    for li in soup.select('.j_joblist>.e'):
        # .tags is optional; query it once (the original queried it twice).
        tags = li.select_one('.tags')
        company = li.select_one('.er>a')
        item = {
            'name': li.select_one('.jname.at').attrs['title'],
            'link': li.select_one('a').attrs['href'],
            'time': li.select_one('.time').get_text(),
            'price': li.select_one('.sal').get_text(),
            'info': li.select_one('.d.at').get_text().replace(' ',''),
            'tags': tags.attrs['title'] if tags else '',
            'co_name': company.get_text(),
            'co_link': company.attrs['href'],
            'co_type': li.select_one('.er>.dc.at').get_text(),
            'co_intro': li.select_one('.er>.int.at').get_text(),
        }
        all_item.append(item)
    print(all_item)
    return all_item
def download(data):
    """Log a start/end timestamp pair around a 2-second pause.

    The parameter was named ``str`` in the original, shadowing the builtin;
    renamed to ``data`` (the only call site passes it positionally).
    NOTE(review): this also shadows the earlier download(name) defined above.
    """
    print(f'开始下载:{datetime.now()}{data}')
    time.sleep(2)  # stand-in for real work
    print(f'结束下载:{datetime.now()}')
if __name__ == '__main__':
    # Scrape 10 result pages for "数据分析" (URL-encoded) in city 090200.
    threads = []
    b = webdriver.Chrome()
    for i in range(10):
        # NOTE(review): the original URL contained '°reefrom' — the '&deg' of
        # '&degreefrom' had been decoded into the degree sign; restored here.
        b.get(f'https://search.51job.com/list/090200,000000,0000,00,9,99,%25E6%2595%25B0%25E6%258D%25AE%25E5%2588%2586%25E6%259E%2590,2,{i+1}.html?lang=c&postchannel=0000&workyear=99&cotype=99&degreefrom=99&jobterm=99&companysize=99&ord_field=0&dibiaoid=0&line=&welfare=')
        # NOTE(review): an_data(...) executes right here in the main thread;
        # each Thread only runs download() on the already-parsed result, so
        # the parsing is NOT parallelized — confirm this is intended.
        threads.append(Thread(target=download, args=(an_data(b.page_source),)))
    for worker in threads:
        worker.start()