基础爬虫常用代码块

Yooyi_xin

已于 2022-05-11 19:53:42 修改

阅读量984

点赞数

分类专栏： python 文章标签：爬虫前端 python

于 2022-01-14 21:52:53 首次发布

本文链接：https://blog.csdn.net/Yooyi_xin/article/details/122502873

版权

python 专栏收录该内容

6 篇文章 1 订阅

订阅专栏

整理一些平时爬虫常用的代码块。不断更新中……

selenium

from selenium.webdriver import Chrome
from selenium.webdriver.chrome.options import Options

# Build the Chrome options, start the browser, then demonstrate a few common page operations.
option = Options()
option.add_argument('--disable-blink-features=AutomationControlled')  # Turn off the Blink "AutomationControlled" feature so pages are less able to detect an automated browser
option.add_argument('--disable-gpu')  # Disable GPU rendering
option.add_argument("--headless")  # Headless mode (no visible browser window)
option.add_argument('--no-sandbox')  # Per the original author's note: this line is required when running on a server

web = Chrome(options=option)
# web = Chrome(options=option,executable_path='/usr/bin/chromedriver') # If chromedriver is not on PATH, pass its absolute or relative path explicitly
# NOTE(review): recent Selenium releases appear to replace executable_path with a Service object — confirm against the installed version

web.maximize_window()  # Maximize the browser window

# Drag the page scrollbar: assigns a very large scrollTop to the document element
js = "var q=document.documentElement.scrollTop=1000000"
web.execute_script(js)

# Scroll the page to the element with the given id
web.execute_script('document.getElementById("mainsrp-pager").scrollIntoView(false);')

# Open a new page
url = 'https://www.baidu.com'
web.execute_script(f'window.open("{url}")')  # Open url through window.open

# Switch between windows
web.switch_to.window(web.window_handles[-1])  # Switch to the last window handle
web.switch_to.window(web.window_handles[0])  # Switch to the first window handle

爬虫自省

# Automatically retry a request when fetching fails (up to 5 attempts).
import requests

url = 'https://www.baidu.com'

## Approach 1: while loop
num = 0
while num < 5:
    try:
        # A timeout is required for retrying to be meaningful: without it a
        # stalled connection blocks forever and never raises.
        resp = requests.get(url, timeout=10)
        break
    except requests.RequestException:
        # Only network/HTTP-layer failures are retried; anything else
        # (KeyboardInterrupt, programming errors) propagates.
        num += 1

## Approach 2: for loop
for i in range(5):
    try:
        resp = requests.get(url, timeout=10)
        break
    except requests.RequestException:
        continue
else:
    # Reached only when all 5 attempts failed (the loop never hit `break`).
    # Record the URL itself so the failure can be retried later.
    with open('erro_log.txt', mode="a+", encoding='utf-8') as f:
        f.write("出错信息：" + url + '\n')

邮件提醒

"""
这是我自己平时用的。放在服务器运行的时候，
有时候不确定什么时候任务就完成了，
然后写了这么一个小功能，
用于任务完成时的任务提醒
"""

import smtplib
from email.mime.text import MIMEText  # 邮件正文
from email.header import Header

def send_email(self):
    """Send a fixed "task finished" notification mail via QQ Mail's SMTP server.

    The message body, subject and the From/To display headers are all the
    same fixed text. The account, password and recipient strings are
    placeholders that must be replaced before use.

    Args:
        self: Unused by the body; kept so existing callers keep working.

    Raises:
        smtplib.SMTPException: If login or delivery fails.
        OSError: If the connection to the server cannot be established.
    """
    # Build the message and its headers.
    # NOTE(review): From/To carry the notification text rather than mail
    # addresses; some servers may require real addresses here — confirm.
    msg = MIMEText("任务已完成", "plain", 'utf-8')
    msg["From"] = Header("任务已完成", "utf-8")  # sender display header
    msg["To"] = Header("任务已完成", "utf-8")  # recipient display header
    msg["Subject"] = Header("任务已完成", "utf-8")

    # SMTP over SSL on port 465 (QQ Mail is used as the example server).
    # The context manager closes the connection even when login/sendmail
    # raises; previously the socket was never closed.
    with smtplib.SMTP_SSL("smtp.qq.com", 465) as smtp_obj:
        smtp_obj.login("发件人邮箱地址", "邮箱密码")
        # Call smtp_obj.set_debuglevel(1) here to print the SMTP dialogue.
        smtp_obj.sendmail("发件人邮箱", ["收件人邮箱"], msg.as_string())

时间戳

import time

# 13-digit timestamp.
# time.time() returns seconds (10 integer digits for current dates), whereas
# JavaScript mostly works with millisecond timestamps (13 digits), so scale
# by 1000 and truncate to an int.
data = int(time.time() * 1000)

# 格林威治时间

import datetime

# Input text: weekday name, month name, day, clock time, UTC offset, year.
a = 'Fri Apr 01 21:45:44 +0800 2022'
# One strptime directive per field above, joined by single spaces.
b = ' '.join(('%a', '%b', '%d', '%H:%M:%S', '%z', '%Y'))
c = datetime.datetime.strptime(a, b)
print(c)

# Output:
# 2022-04-01 21:45:44+08:00

保存文件


import csv
import json

# Save a dict's values as one row of a CSV file.
# The file name must be a string literal, and the keyword is `encoding`.
# newline='' lets the csv module control line endings itself.
with open('文件名.csv', mode='w', encoding='utf-8', newline='') as f:
    dic = {}
    csvwriter = csv.writer(f)
    csvwriter.writerow(dic.values())

# Save a dict to a file as one JSON document per line.
# The file name must be a string literal, and the keyword is `encoding`.
with open('文件名.json', mode='w', encoding='utf-8') as f:
    dic = {}
    # ensure_ascii=False writes non-ASCII text as-is instead of \uXXXX escapes
    temp_dic = json.dumps(dic, ensure_ascii=False)
    f.write(temp_dic)
    f.write('\n')

特殊字符处理

# Python interprets the backslash sequence in this literal as an escape, so
# `a` holds a single lone-surrogate code point rather than six characters.
# Per the original note, such values can make saving the string fail with
# an encoding error.
a = '\ud83d'
# Workaround: print the repr form, which shows the value as escaped text
# (backslash and all) instead of the raw character.
print(f'{a!r}')

无限debug

// For "infinite debugger" traps that are built through the Function constructor.
// Paste the code below into the browser console and run it.

// Keep the real Function constructor. The bare global `constructor` is not
// the Function constructor, so delegating to it breaks ordinary calls.
var _constructor = Function.prototype.constructor;
Function.prototype.constructor = function (s) {
    // Compare, do not assign: `s = "debugger"` is always truthy, which made
    // every call return null.
    if (s === "debugger") {
        return null;
    }
    // Forward every argument (parameter names plus body) to the real constructor.
    return _constructor.apply(this, arguments);
};

pandas保存excel

import pandas as pd

content_list = []  # Put the records to export here (dicts, lists, ...)
df = pd.DataFrame(content_list)

# No `encoding` argument: DataFrame.to_excel no longer accepts it (deprecated
# in pandas 1.5, removed in 2.0), so passing it raises TypeError.
# index=True also writes the DataFrame index as the first column.
df.to_excel("test.xlsx", index=True)

pandas将数据保存在同一个excel中的不同sheet

import pandas as pd

content_list_1 = []  # Records for the first sheet (dicts, lists, ...)
content_list_2 = []  # Records for the second sheet (dicts, lists, ...)

# Approach 1: create the writer by hand and close it explicitly.
df = pd.DataFrame(content_list_1)
df_2 = pd.DataFrame(content_list_2)

xlsx = pd.ExcelWriter('test_1.xlsx')
df.to_excel(xlsx, sheet_name='sheet1')
df_2.to_excel(xlsx, sheet_name='sheet2')
xlsx.close()  # closing the writer is what saves the workbook to disk


# Approach 2: use the writer as a context manager, like a file object.
# The workbook is saved and closed when the block exits, even on error.

with pd.ExcelWriter('test_1.xlsx') as xlsx:

    df = pd.DataFrame(content_list_1)
    df.to_excel(xlsx, sheet_name='sheet1')

    df_2 = pd.DataFrame(content_list_2)
    df_2.to_excel(xlsx, sheet_name='sheet2')

Yooyi_xin

关注

0
点赞
踩
1

收藏

觉得还不错? 一键收藏
0
评论
基础爬虫常用代码块

整理一些平时常用的代码块。seleniumfrom selenium.webdriver import Chromefrom selenium.webdriver.chrome.options import Optionsoption = Options()option.add_argument('--disable-blink-features=AutomationControlled') # 关闭网页对自动化浏览器的监测option.add_argument('--disable-gp
复制链接

扫一扫

专栏目录