Anti-crawling
The user_agent field in the request headers contains the keyword "Scrapy", which makes the traffic easy to identify as a crawler, so we can disguise it as a browser. Open any website in a browser, and you can find that browser's user_agent in the developer tools.
The complete code is as follows:
import time
from selenium import webdriver
from models import *


def get_driver():
    options = webdriver.ChromeOptions()
    # options.add_argument('--blink-settings=imagesEnabled=false')
    # options.add_argument('--headless')
    options.add_argument('--window-size=1440,1080')
    options.add_argument('--disable-extensions')
    options.add_argument('--no-sandbox')  # needed when Chrome runs as root
    options.add_argument('--disable-setuid-sandbox')
    options.add_argument('--disable-dev-shm-usage')
    options.add_argument('--disable-gpu')
    # hide the "Chrome is being controlled by automated test software" infobar
    options.add_experimental_option('excludeSwitches', ['enable-automation'])
    # disguise as a normal browser by replacing the default user agent
    options.add_argument(
        'user-agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.130 Safari/537.36"')
    driver = webdriver.Chrome(options=options)
    return driver


def get_data(driver, keys, main_key, p=1):
    # p: results page number (the paging loop is elided below)
    url = f'https://xxxx.com/en-us/s/{keys}?businessesPage={p}&query={main_key}'
    driver.get(url)
    # locate elements by the site's (obfuscated) class names
    tt = driver.find_element_by_class_name('purify_1C2sKfbn9OVsW').find_elements_by_css_selector(
        ".purify_T9Yll5MsAPX > a")
    ad = driver.find_element_by_css_selector(
        "[class='purify_X purify_2SvKv3MmhxAqq-wQiZJQc3 purify_g4p_j_IN6T']")
    ff = ad.find_element_by_css_selector(
        "[class='purify_1sQU5pf3yAvt purify_3k1NnTEGO6TSunXbY5Zrkx']").text
    # ....... (rest of the parsing omitted)


if __name__ == '__main__':
    main_key = ''
    keys_list = ["a", "b"]
    driver = get_driver()
    for k in keys_list:
        get_data(driver, k, main_key)
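To confirm that the disguise took effect, you can ask the running browser which user agent it reports. A minimal check, reusing get_driver() from above:

driver = get_driver()
# the page sees navigator.userAgent, so query it directly
print(driver.execute_script('return navigator.userAgent'))  # should contain "Chrome/79.0.3945.130"
driver.quit()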
A standalone smoke test of the disguised driver, which simply opens Baidu to confirm the browser launches with the options above:

def get_data():
    driver = get_driver()
    url = 'https://www.baidu.com/'
    driver.get(url)
    # ......


if __name__ == '__main__':
    get_data()
Database
# collect each record as a dict
datalist.append(dict(name=name, phone=phone, address=address, source="aaa"))
# iterate over the dicts; write a record to the database only if no row with that name exists yet
for d in datalist:
    find_data = SQLsession.query(Infos).filter_by(name=d['name']).first()
    if not find_data:
        SQLsession.add(Infos(**d))
        SQLsession.commit()
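If datalist grows large, one SELECT plus one COMMIT per record becomes the bottleneck. A possible variant (a sketch, still assuming name is the deduplication key) fetches the existing names once and commits a single batch:

# fetch existing names in one query, filter locally, insert in one batch
existing = {row.name for row in SQLsession.query(Infos.name).all()}
new_rows = [Infos(**d) for d in datalist if d['name'] not in existing]
SQLsession.add_all(new_rows)
SQLsession.commit()

Note that duplicate names inside datalist itself would still need to be removed beforehand.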
models.py (ORM)
from sqlalchemy import *
import pymysql
from sqlalchemy.orm import sessionmaker
from sqlalchemy.ext.declarative import declarative_base
from datetime import datetime

# mysql+pymysql://<user>:<password>@<host>/<database>?charset=utf8mb4
database = 'mysql+pymysql://root:password@localhost/dbname?charset=utf8mb4'

Base = declarative_base()
# create the database connection
engine = create_engine(database)
DBSession = sessionmaker(bind=engine)
SQLsession = DBSession()


# ORM model
class Infos(Base):
    __tablename__ = 'infos'  # table name

    id = Column(Integer(), primary_key=True)
    code = Column(String(255))
    name = Column(String(255))
    # columns matching the fields collected by the crawler above
    phone = Column(String(255))
    address = Column(String(255))
    source = Column(String(255))
    status = Column(Integer(), default=1)
    remark = Column(Text)
    # pass the callable datetime.now (no parentheses); with datetime.now()
    # every row would get the timestamp of module import, not of the insert
    created = Column(DateTime, default=datetime.now)
    updated = Column(DateTime, default=datetime.now, onupdate=datetime.now)


Base.metadata.create_all(engine)
Random proxy IPs
For sites that ban by IP, slowing down with delays drags crawler throughput way down. A better approach is an IP proxy, so that each request reaches the server from a different IP.
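With Selenium, one way to do this is Chrome's --proxy-server switch. The switch itself is standard Chrome; the pool below is a hypothetical placeholder that would in practice be filled from a proxy provider's API:

import random
from selenium import webdriver

# hypothetical pool; in practice, fetch live proxies from your provider
PROXY_POOL = ['1.2.3.4:8888', '5.6.7.8:8888']

def get_proxy_driver():
    options = webdriver.ChromeOptions()
    # each new browser instance picks a random proxy from the pool
    options.add_argument(f'--proxy-server=http://{random.choice(PROXY_POOL)}')
    return webdriver.Chrome(options=options)

Note that a Chrome instance keeps its proxy for its entire lifetime, so rotating per request means restarting the driver or pointing --proxy-server at a local rotating-proxy service.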