scrapy_util

selenium

# Scroll the page down to the bottom of the currently loaded content
# bro is the Selenium browser (webdriver) object
jscode = 'window.scrollTo(0, document.body.scrollHeight)'
bro.execute_script(jscode)
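
For pages that lazy-load content, one scroll is usually not enough. A sketch, assuming a chromedriver on PATH and a placeholder URL, that keeps scrolling until the page height stops growing:

    import time
    from selenium import webdriver

    bro = webdriver.Chrome()
    bro.get('https://www.example.com')   # placeholder URL

    last_height = bro.execute_script('return document.body.scrollHeight')
    while True:
        bro.execute_script('window.scrollTo(0, document.body.scrollHeight)')
        time.sleep(2)                    # give new content time to load
        new_height = bro.execute_script('return document.body.scrollHeight')
        if new_height == last_height:    # nothing new loaded, stop
            break
        last_height = new_height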


# Headless browser
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
chrome_options = Options()
chrome_options.add_argument('--headless')
chrome_options.add_argument('--disable-gpu')
# Evade automation detection (hides the "controlled by automated software" infobar)
chrome_options.add_experimental_option('excludeSwitches', ['enable-automation'])
# Pass a single merged Options object (passing both chrome_options= and options= conflicts)
bro = webdriver.Chrome(options=chrome_options)
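
A quick sanity check that the headless setup works (the URL is a placeholder):

    bro.get('https://www.example.com')   # placeholder URL
    print(bro.title)                     # the page title still loads without a visible window
    print(len(bro.page_source))          # rendered HTML is available via page_source
    bro.quit()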

Downloader middleware

Whether you intercept requests, rotate proxy IPs, or intercept responses, you must finish by uncommenting (enabling) DOWNLOADER_MIDDLEWARES in settings.py; a consolidated middleware sketch follows the list below.
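
What enabling it looks like in settings.py; the project and class names below are placeholders, use the ones Scrapy generated for your project:

    # settings.py
    DOWNLOADER_MIDDLEWARES = {
        'myproject.middlewares.MyprojectDownloaderMiddleware': 543,   # placeholder path
    }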

  • Intercepting requests

    user_agent_list = [
                        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 "
                        ".",
                        "Mozilla/5.0 (X11; CrOS i686 2268.111.0) AppleWebKit/536.11 "
                        "(KHTML, like Gecko) Chrome/20.0.1132.57 Safari/536.11",
                        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.6 "
                        "(KHTML, like Gecko) Chrome/20.0.1092.0 Safari/536.6",
                        "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.6 "
                        "(KHTML, like Gecko) Chrome/20.0.1090.0 Safari/536.6",
                        "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.1 "
                        "(KHTML, like Gecko) Chrome/19.77.34.5 Safari/537.1",
                        "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/536.5 "
                        "(KHTML, like Gecko) Chrome/19.0.1084.9 Safari/536.5",
                        "Mozilla/5.0 (Windows NT 6.0) AppleWebKit/536.5 "
                        "(KHTML, like Gecko) Chrome/19.0.1084.36 Safari/536.5",
                        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 "
                        "(KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
                        "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/536.3 "
                        "(KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
                        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_0) AppleWebKit/536.3 "
                        "(KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
                        "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 "
                        "(KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
                        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 "
                        "(KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
                        "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 "
                        "(KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
                        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 "
                        "(KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
                        "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/536.3 "
                        "(KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
                        "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 "
                        "(KHTML, like Gecko) Chrome/19.0.1061.0 Safari/536.3",
                        "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/535.24 "
                        "(KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",
                        "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/535.24 "
                        "(KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24"
                    ]
    # Configure it in the middleware's process_request method (requires: import random)
        def process_request(self, request, spider):
            request.headers['User-Agent'] = random.choice(self.user_agent_list)
    
  • Proxy IPs

    PROXY_http = [
      '153.180.102.104:80',
      '195.208.131.189:56055',
    ]
    PROXY_https = [
      '120.83.49.90:9000',
      '95.189.112.214:35508',
    ]
    # Configure it in the middleware's process_exception method
    def process_exception(self, request, exception, spider):
        if request.url.split(":")[0] == "http":
            request.meta['proxy'] = "http://" + random.choice(self.PROXY_http)
        else:
            request.meta['proxy'] = "https://" + random.choice(self.PROXY_https)
        # Return the request so Scrapy reschedules it through the new proxy
        return request
    
  • Intercepting responses

    # Typically used for dynamically loaded (AJAX) pages, combining Scrapy with Selenium
    # 1. Handle it in process_response; import HtmlResponse first:
    #    from scrapy.http import HtmlResponse
    def process_response(self, request, response, spider):
        # Only intercept the specific responses that need a rendered page
        bro = spider.bro
        if request.url in spider.model_urls:
            # Replace the response body with the Selenium-rendered page source
            bro.get(request.url)
            page_source = bro.page_source
            return HtmlResponse(url=request.url, body=page_source, encoding='utf-8', request=request)
        else:
            return response
    # 2. In the spider file:
    #    - build the Selenium driver (e.g. self.bro) in __init__,
    #    - add a close method to quit it:
    def close(self, spider):
        self.bro.quit()
    # 3. In settings.py, uncomment (enable) DOWNLOADER_MIDDLEWARES
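
Putting the three hooks together, here is a minimal consolidated sketch of a downloader middleware plus the spider-side Selenium setup it relies on. The class names, spider name, URLs, and the model_urls attribute are placeholders; the full UA and proxy lists are the ones shown earlier.

    # middlewares.py (sketch)
    import random

    from scrapy.http import HtmlResponse

    class RotatingDownloaderMiddleware:
        user_agent_list = [
            "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/536.5 "
            "(KHTML, like Gecko) Chrome/19.0.1084.9 Safari/536.5",
            # ... the rest of the list shown earlier ...
        ]
        PROXY_http = ['153.180.102.104:80', '195.208.131.189:56055']
        PROXY_https = ['120.83.49.90:9000', '95.189.112.214:35508']

        def process_request(self, request, spider):
            # Rotate the User-Agent on every outgoing request
            request.headers['User-Agent'] = random.choice(self.user_agent_list)
            return None

        def process_response(self, request, response, spider):
            # Swap in a Selenium-rendered page only for the URLs that need it
            if request.url in getattr(spider, 'model_urls', []):
                spider.bro.get(request.url)
                return HtmlResponse(url=request.url, body=spider.bro.page_source,
                                    encoding='utf-8', request=request)
            return response

        def process_exception(self, request, exception, spider):
            # On failure, reschedule the request through a random proxy
            if request.url.split(":")[0] == "http":
                request.meta['proxy'] = "http://" + random.choice(self.PROXY_http)
            else:
                request.meta['proxy'] = "https://" + random.choice(self.PROXY_https)
            return request

    # spider file (sketch) -- builds the driver the middleware uses
    import scrapy
    from selenium import webdriver
    from selenium.webdriver.chrome.options import Options

    class ExampleSpider(scrapy.Spider):
        name = 'example'                          # placeholder
        start_urls = ['https://www.example.com']  # placeholder

        def __init__(self, *args, **kwargs):
            super().__init__(*args, **kwargs)
            opts = Options()
            opts.add_argument('--headless')
            self.bro = webdriver.Chrome(options=opts)
            self.model_urls = []                  # fill with the AJAX URLs to render

        def close(self, spider):
            self.bro.quit()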
    

Working with multiple items

# Sometimes you need several item classes to hold different kinds of data.
# In pipelines.py, branch on the item's class name:
def process_item(self, item, spider):
    if item.__class__.__name__ == 'QuotesproItem':
        author = item['author']
        quote = item['quote']
    else:
        detail_desc = item['detail_desc']
        detail_author = item['detail_author']
    return item
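
For reference, the matching items.py could look like this; QuotesproItem and its fields come from the snippet above, while DetailItem is a placeholder name for the second item class:

    # items.py (sketch)
    import scrapy

    class QuotesproItem(scrapy.Item):
        author = scrapy.Field()
        quote = scrapy.Field()

    class DetailItem(scrapy.Item):       # placeholder class name
        detail_desc = scrapy.Field()
        detail_author = scrapy.Field()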

Simulated login in Scrapy

  • Simulated login by carrying cookies in the Request

    # Steps:
    # - Create a Scrapy project and adjust the settings
    # - Create the spider file
    # - Override start_requests (copy this snippet over and swap in your own cookie string)
        def start_requests(self):
            # Cookies taken from a request in Chrome DevTools after logging in
            cookiesstr = "cookie string copied from the logged-in page"
            cookies = {i.split("=", 1)[0]: i.split("=", 1)[1] for i in cookiesstr.split("; ")}
    
            # Request carrying the cookies
            yield scrapy.Request(
                self.start_urls[0],
                callback=self.parse,
                cookies=cookies
            )      
    # - Run the spider (it really is that simple, and I like this approach because it is practical)
    # - To watch how the cookies are passed along,
    #   add COOKIES_DEBUG = True in settings.py
    
  • Simulated login with a POST request

    # - Create the project
    # - Create the spider file
    # - Send the form data (requires: from scrapy.http import FormRequest)
       def parse(self, response):
           # form data for the login POST request
           postData = {
               'ck': '',
               'name': '****',      # username
               'password': '****',  # password
               'remember': 'false',
               'ticket': ''
           }
           return [FormRequest.from_response(
                         response,
                         formdata = postData,
                         callback = self.after_login, 
                         dont_filter = True)]
    # - Write an after_login method to parse the response (a sketch follows this list)
    # - Run the spider
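
A minimal sketch of what after_login might look like; the failure check, the follow-up URL, and the parse_profile callback are all assumptions that depend on the target site:

    def after_login(self, response):
        # Assumption: a failed login usually lands back on a URL containing "login"
        if 'login' in response.url:
            self.logger.error('login appears to have failed')
            return
        # Logged in: keep crawling pages that require authentication
        # (the URL and parse_profile callback are hypothetical)
        yield scrapy.Request('https://www.example.com/profile',
                             callback=self.parse_profile)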
    

How to send a POST request

# requires: from scrapy import FormRequest
def __init__(self):
    self.headers = {
        "User-Agent": "put a real UA string here",
        "X-Requested-With": "XMLHttpRequest"
    }

# Because this is a POST request, start_requests() must be overridden
def start_requests(self):
    for page in range(0, 7):
        form_data = {
            "otype": "4",
            "city": "",
            "start": str(25 * page),   # paginate 25 records at a time
            "amount": "25"
        }
        yield FormRequest(self.start_urls[0], headers=self.headers,
                          formdata=form_data, callback=self.parse)

def parse(self, response):
    pass
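
Since the X-Requested-With header mimics an AJAX call, the endpoint presumably returns JSON. A sketch of parse under that assumption (the 'subjects', 'title' and 'rate' keys are placeholders, and response.json() needs Scrapy 2.2 or newer):

    def parse(self, response):
        data = response.json()                     # parse the JSON body
        for record in data.get('subjects', []):    # placeholder key
            yield {
                'title': record.get('title'),      # placeholder field
                'rate': record.get('rate'),        # placeholder field
            }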

Two ways to run a spider

  • Method 1: from the command line: scrapy crawl <spider_name>

  • Method 2: create a begin.py file in the project root:

    from scrapy import cmdline

    cmdline.execute("scrapy crawl spider_name".split())  # replace spider_name with your spider's name

Pipeline storage options

  • csv

    # requires: import csv
    # (the pipeline must also be registered in settings.py; see the sketch after the MySQL example)
    fp = None
    fw = None
    def open_spider(self, spider):
        # newline='' prevents blank rows between records on Windows
        self.fp = open('./yunwoke.csv', 'wt', encoding='utf-8', newline='')
        self.fw = csv.writer(self.fp)
        self.fw.writerow(['needTitle', 'needDetail', 'many', 'timer', 'publicTime', 'status'])
    def process_item(self, item, spider):
        self.fw.writerow([item['needTitle'], item['needDetail'], item['many'],
                          item['timer'], item['publicTime'], item['status']])
        return item
    def close_spider(self, spider):
        self.fp.close()
    
  • mysql

# requires: import pymysql
conn = None
cursor = None
def open_spider(self, spider):
    self.conn = pymysql.connect(
        host="127.0.0.1",
        port=3306,
        user='root',
        password='034312',
        db='',          # fill in the database name
        charset='utf8'
    )
    self.cursor = self.conn.cursor()   # create the cursor here so process_item can use it
def process_item(self, item, spider):
    try:
        self.cursor.execute(
            'insert into yunwoke(needTitle,needDetail,many,timer,publicTime,status) values (%s,%s,%s,%s,%s,%s)',
            [item['needTitle'],
             item['needDetail'],
             item['many'],
             item['timer'],
             item['publicTime'],
             item['status']]
        )
        self.conn.commit()
    except Exception as e:
        self.conn.rollback()
        print(e)
    return item
def close_spider(self, spider):
    self.cursor.close()
    self.conn.close()
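
Neither pipeline runs until it is registered in settings.py. A sketch, with placeholder project and class names:

    # settings.py
    ITEM_PIPELINES = {
        'myproject.pipelines.CsvPipeline': 300,     # placeholder paths; lower number runs first
        'myproject.pipelines.MysqlPipeline': 301,
    }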