A Quick Introduction to Web Crawlers

Fetching the data

1. A simple request:

import requests
url = 'https://www.baidu.com'
headers={"User-Agent":"Mozilla/5.0 (Windows NT 10.0; WOW64; rv:38.0) Gecko/20100101 Firefox/38.0"}
data = requests.get(url, headers=headers)
print(data.text)

2. Parsing the data:

from bs4 import BeautifulSoup
soup = BeautifulSoup(data.text, 'lxml')
print(soup)

The soup object is mainly used through soup.find() and soup.find_all().
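
As a quick illustration (a minimal sketch; the tag names and the 'item' class used here are hypothetical and depend on the page being parsed):

# find() returns the first matching tag, find_all() returns a list of all matches
title_tag = soup.find('title')               # the first <title> tag
links = soup.find_all('a')                   # every <a> tag on the page
items = soup.find_all('div', class_='item')  # narrow the search with attributes
for link in links:
    print(link.get('href'))                  # the link target, if present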

3. Storing the data:

import pandas as pd
result = pd.DataFrame()                    # create an empty DataFrame to hold the data
result['img_url'] = img_urls               # fill a column (img_urls is a list collected earlier)
result.to_csv('result.csv', index=None)    # write the result to a CSV file

4. Exception handling:

try:
    ...          # the code that may raise an exception
except Exception as e:
    ...          # handle the exception
finally:
    ...          # cleanup that always runs
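
As a concrete instance of the pattern (a minimal sketch reusing the requests call from section 1; url and headers are assumed to be defined as above):

try:
    data = requests.get(url, headers=headers, timeout=5)
except requests.exceptions.RequestException as e:
    print("Request failed:", e)         # connection, timeout and HTTP-level errors land here
    data = None
finally:
    print("Finished requesting", url)   # runs whether or not an exception occurred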

A more general retry pattern (a fragment of get_data; the complete function appears in section 7 below):

def get_data(url, num_retries=3):
    # ... data = requests.get(url, ...) happens earlier in the function ...
    if (data is not None) and (500 <= data.status_code < 600):
        if num_retries > 0:
            print("Server error, retrying...")
            time.sleep(1)
            num_retries -= 1
            return get_data(url, num_retries)
Note: when the server responds with an error, the request can simply be retried.

5. Dynamic User-Agent:

from fake_useragent import UserAgent
ua = UserAgent()
ua.chrome    # a Chrome User-Agent string
ua.ie        # an IE User-Agent string
ua.random    # a random User-Agent string

6. Handling encodings:

import chardet
data = requests.get(url, headers=headers)
charset = chardet.detect(data.content)    # detect the encoding
data.encoding = charset['encoding']       # set the detected encoding
print(data.text)

7. Putting the previous pieces together:

  import re
  import time
  import chardet
  import requests
  import urllib.robotparser
  from fake_useragent import UserAgent


  # build the request headers
  def get_headers():
      ua = UserAgent()
      user_agent = ua.random
      headers = {'User-Agent': user_agent}

      return headers


  # This function simply returns hard-coded proxies;
  # it could equally scrape free proxy IPs, but that is not the focus here
  def get_proxies():
      # a dict cannot hold duplicate keys, so list one proxy per scheme
      proxies = {
          "http": "123.84.13.240:8118",
          "https": "94.240.33.242:3128"
      }

      return proxies


  # robots.txt check
  def robot_check(robotstxt_url, headers, url):
      rp = urllib.robotparser.RobotFileParser()
      rp.set_url(robotstxt_url)
      rp.read()
      result = rp.can_fetch(headers['User-Agent'], url)

      return result


  # Fetch the page data. We do not return data.text here,
  # because when scraping images the caller needs data.content
  def get_data(url, num_retries=3, proxies=None):
      try:
          # headers is the module-level variable set in __main__
          data = requests.get(url, timeout=5, headers=headers, proxies=proxies)
          print(data.status_code)
      except requests.exceptions.ConnectionError as e:
          print("请求错误, url:", url)
          print("错误详情:", e)
          data = None
      except:  # other error
          print("未知错误, url:", url)
          data = None

      if (data is not None) and (500 <= data.status_code < 600):
          if num_retries > 0:
              print("Server error, retrying...")
              time.sleep(1)
              num_retries -= 1
              return get_data(url, num_retries, proxies=proxies)

      return data


  # Parse the page content: extraction, storage, etc.
  def parse_data(data):
      if data is None:
          return None

      charset = chardet.detect(data.content)
      data.encoding = charset['encoding']
      html_text = data.text
      '''
      parsing/extraction logic goes here; suppose we want the page title
      '''
      interesting_data = re.findall('<title>(.*?)</title>', html_text)

      return interesting_data


  if __name__ == '__main__':
      headers = get_headers()
      proxies = get_proxies()
      data = get_data("http://www.baidu.com", num_retries=3, proxies=proxies)
      interesting_data = parse_data(data)
      print(interesting_data)

8. Simulated login:

First log in once through the form to obtain the cookies and save them to a file; afterwards the cookies are loaded from the file and passed to a session object, which handles the subsequent data requests.
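
A minimal sketch of this flow (the login URL, form field names and cookie file name are placeholders, not taken from any specific site):

import pickle
import requests

LOGIN_URL = 'https://example.com/login'   # hypothetical login endpoint
COOKIE_FILE = 'cookies.pkl'               # file used to persist the cookies

def login_and_save_cookies(username, password):
    session = requests.Session()
    # the form field names depend on the actual login page
    session.post(LOGIN_URL, data={'username': username, 'password': password})
    with open(COOKIE_FILE, 'wb') as f:
        pickle.dump(session.cookies, f)   # save the cookies for later runs

def load_session_from_cookies():
    session = requests.Session()
    with open(COOKIE_FILE, 'rb') as f:
        session.cookies.update(pickle.load(f))   # restore the saved cookies
    return session

# later requests reuse the restored session, e.g.
# session = load_session_from_cookies()
# data = session.get('https://example.com/some-protected-page')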

9. Dealing with CAPTCHAs:

  1. Manual input
  2. OCR with pytesseract (see the sketch after the Yundama example below)
  3. A cloud captcha-solving platform such as Yundama (云打码)
Wrap the platform call in its own file so it can be reused later:

  import json
  import time
  import requests

  def getcode_from_yundama():

      captcha_username = 'your username'
      captcha_password = 'your password'
      captcha_id = 1
      captcha_appkey = 'your appkey'
      captcha_codetype = '3000'
      captcha_url = 'http://api.yundama.com/api.php?method=upload'
      captcha_result_url = 'http://api.yundama.com/api.php?cid{}&method=result'
      filename = 'douban.jpg'
      timeout = 30

      postdata = {'method': 'upload', 'username': captcha_username,
                  'password': captcha_password, 'appid': captcha_id,
                  'appkey': captcha_appkey, 'codetype': captcha_codetype,
                  'timeout': timeout}

      fo = open(filename, 'rb')
      file = {'file': fo.read()}
      response = requests.post(captcha_url, postdata, files=file).text
      print(response)
      fo.close()

      response = json.loads(response)
      code = response['text']
      status = response['ret']
      if status == 0:
          print("识别成功!")
          print('验证码为:', code)

      return code
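
For approach 2 in the list above, a minimal pytesseract sketch might look like the following (the captcha file name is a placeholder, the Tesseract binary must be installed, and real captchas usually need extra image preprocessing):

  from PIL import Image
  import pytesseract

  def getcode_from_pytesseract(filename='captcha.jpg'):
      image = Image.open(filename).convert('L')         # convert to grayscale to help the OCR
      code = pytesseract.image_to_string(image).strip()
      return code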

10. Dynamically loaded content:

  1. Direct extraction: pull the embedded JSON out of the page source with a regular expression (re.findall())

    import re, json

    re_data = re.findall(r'pcMiaoShaAreaList\((\{.*\})\)', data.text)[0]

    json_data = json.loads(re_data)

  2. Fetching with Selenium.

    Headless-browser approach:
    import time
    from selenium import webdriver
    from selenium.webdriver.common.desired_capabilities import DesiredCapabilities  # to customize the request headers
    
    def run():
        login_url = 'https://accounts.douban.com/login'  # the page to open
        dcap = dict(DesiredCapabilities.PHANTOMJS)
        dcap["phantomjs.page.settings.userAgent"] = (
            "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:50.0) Gecko/20100101 Firefox/50.0")
        driver = webdriver.PhantomJS('/home/shensir/phantomjs-2.1.1-linux-x86_64/bin/phantomjs',
                                     desired_capabilities=dcap)
        driver.get(login_url)  # open the page
        time.sleep(5)  # wait 5 s so the page loads completely
    
        # screenshot of the initial login page
        driver.get_screenshot_as_file('before-login.png')
    
        # html = driver.page_source  # the current page source
        # print(html)
    
        # fill in the account and password
        driver.find_element_by_xpath('//*[@id="email"]').send_keys('your account')
        driver.find_element_by_xpath('//*[@id="password"]').send_keys('your password')
    
        time.sleep(3)
        # screenshot of the page after filling in the form
        driver.get_screenshot_as_file('after-insert.png')
    
        # click the login button
        driver.find_element_by_xpath('//*[@id="lzform"]/div[6]/input').click()
        # check the page after logging in
        time.sleep(3)
        driver.get_screenshot_as_file('after-login.png')
    
        '''
        post-login operations go here, e.g.
        driver.get('http://...')
        getdata(driver.page_source)
        '''
    
        # If the program crashes, the driver is not released automatically,
        # so in real use wrap this in exception handling to guarantee the driver is released
        driver.quit()
    
    
    if __name__ == '__main__':
        run()
    
    

11. Multithreading and multiprocessing:

The code below walks through the different approaches:

    import time
    import requests
    import concurrent.futures
    import pandas as pd
    import threading
    from multiprocessing import Pool


    # decorator that prints a function's execution time
    def gettime(func):
        def wrapper(*args, **kwargs):
            print("=" * 50)
            print(func.__name__, 'Start...')
            starttime = time.time()
            func(*args)
            endtime = time.time()
            spendtime = endtime - starttime
            print(func.__name__, "End...")
            print("Spend", spendtime, "s totally")
            print("=" * 50)

        return wrapper


    # read n test URLs from a file
    def get_urls_from_file(n):
        df = pd.read_csv('TestUrls.csv')  # 1000 URLs in total
        urls = list(df['url'][:n])

        return urls


    # Request and parse a page (here the "data" is simply the page source)
    def getdata(url, retries=3):
        # print("Downloading:", url)
        headers = {}
        try:
            html = requests.get(url, headers=headers)
            # print(html)

        except requests.exceptions.ConnectionError as e:
            # print('Download error [ConnectionError]:', e)
            html = None

        # 5xx responses are server errors, so the request can be retried
        if html is not None and 500 <= html.status_code < 600 and retries:
            retries -= 1
            # print('Server error, retrying...')
            return getdata(url, retries)

        # return the page source on success, None on failure
        data = html.text if html is not None else None

        return data


    # serial version
    @gettime
    def Mynormal():
        for url in urls:
            getdata(url)


    # process pool
    @gettime
    def MyprocessPool(num=10):
        pool = Pool(num)
        results = pool.map(getdata, urls)

        pool.close()
        pool.join()
        return results


    # multithreading
    @gettime
    def Mymultithread(max_threads=10):
        # worker: keep processing URLs until the list is empty
        def urls_process():
            while True:
                try:
                    # pop a url off the end of urls
                    url = urls.pop()
                except IndexError:
                    # stop once urls has been exhausted
                    break
                data = getdata(url, retries=3)
                '''
                extraction and storage of the page data goes here
                '''

        threads = []

        # while below the thread limit and URLs remain to be crawled, spawn new threads to speed things up
        while len(threads) < max_threads and urls:
            thread = threading.Thread(target=urls_process)
            # print('Creating thread', thread.getName())
            thread.start()
            threads.append(thread)

        for thread in threads:
            thread.join()


    # thread pool
    @gettime
    def Myfutures(num_of_max_works=10):
        with concurrent.futures.ThreadPoolExecutor(max_workers=num_of_max_works) as executor:
            executor.map(getdata, urls)


    if __name__ == '__main__':
        # test with 100 pages
        urls = get_urls_from_file(100)
        Mynormal()  # serial
        MyprocessPool(10)  # process pool
        Myfutures(10)  # thread pool
        Mymultithread(10)  # multithreading
       
      
'''

100 pages

==================================================
Mynormal Start...
Mynormal End...
Spend 20.605727672576904 s totally
==================================================
==================================================
MyprocessPool Start...
MyprocessPool End...
Spend 2.4525890350341797 s totally
==================================================
==================================================
Mymultithread Start...
Mymultithread End...
Spend 2.1947641372680664 s totally
==================================================
==================================================
Myfutures Start...
Myfutures End...
Spend 2.1515889167785645 s totally
==================================================

'''
