Idiomatic page-level crawling with requests, with an example

The util.py file:

import requests

def get(url, params=None, cookie=None, headers=None, proxies=None):
    '''
    Send a GET request.
    :param url: target URL
    :param params: query-string parameters
    :param cookie: cookies to send with the request
    :param headers: request headers
    :param proxies: proxy configuration
    :return: (1, response body) on success, (0,) on failure
    '''
    s = requests.Session()
    try:
        if params is not None:
            s.params = params
        if cookie is not None:
            s.cookies = cookie
        if headers is not None:
            s.headers = headers
        if proxies is not None:
            s.proxies = proxies
        r = s.get(url=url, timeout=20)
        return (1, r.content)
    except Exception as e:
        print(e)
    finally:
        s.close()
    return (0,)


def post(url, data, params=None, cookie=None, headers=None, proxies=None):
    '''
    Send a POST request.
    :param url: target URL
    :param data: form data for the request body
    :param params: query-string parameters
    :param cookie: cookies to send with the request
    :param headers: request headers
    :param proxies: proxy configuration
    :return: (1, response body, response cookies) on success, (0,) on failure
    '''
    s = requests.Session()
    try:
        if params is not None:
            s.params = params
        if cookie is not None:
            s.cookies = cookie
        if headers is not None:
            s.headers = headers
        if proxies is not None:
            s.proxies = proxies
        r = s.post(url=url, data=data, timeout=20)
        return (1, r.content, r.cookies)
    except Exception as e:
        print(e)
    finally:
        s.close()
    return (0,)
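A call to post() follows the same pattern as get(). The sketch below is only an illustration: the URL and form fields are hypothetical placeholders (not a real endpoint), and it assumes the module has been imported as util, as in the crawling example further down. It shows how the cookies returned in the third tuple element can be fed back into get() for a follow-up request.

# Hypothetical usage sketch -- example.com and the form fields are placeholders.
r = util.post("http://example.com/login",
              data={"username": "demo", "password": "demo"})
if r[0] == 1:
    body = r[1].decode()       # response body
    login_cookies = r[2]       # cookies set by the server
    # reuse the server-issued cookies for a follow-up request
    r2 = util.get("http://example.com/profile", cookie=login_cookies)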

Crawling example (guazi):

from xxx import util
import re
from lxml import etree
from bs4 import BeautifulSoup

head = {"Cookie": "antipas=2W192MJ893976a23019W485050817",
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
        "Accept-Language": "zh-CN,zh;q=0.9",
        "Cache-Control": "max-age=0",
        "Connection": "keep-alive",
        "Host": "www.guazi.com",
        "Upgrade-Insecure-Requests": "1",
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36"}

r = util.get("https://www.guazi.com/sjz/dazhong/", headers=head)
if r[0] == 1:
    body = r[1].decode()

bs4 approach:

soup = BeautifulSoup(body, 'lxml')
items = soup.find_all('a', class_="car-a")
for item in items:
    href = item.attrs["href"]                   # detail-page link
    title = item.attrs["title"]                 # listing title
    gl = item.contents[7].text.split('|')[1]    # second field of the pipe-delimited info line
    price = item.contents[9].contents[3].text   # price text from the price block

xpath approach:

html = etree.HTML(body)
data = html.xpath('//ul[@class="carlist clearfix js-top"]/li')
for item in data:
    href = item.xpath('a/@href')[0]
    title = item.xpath('a/@title')[0]
    gl = item.xpath('a/div[1]/text()')[1]
    price = item.xpath('a/div[2]/p/text()')[0]

Regex approach:

body = r[1].decode().replace("\n", "").replace("\r", "").replace("\t", "")
com = re.compile('<li data-scroll-track=.*?href="(.*?)".*?"t">(.*?)</h2>.*?</span>(.*?)</div>.*?<p>(.*?)</p>')
data = com.findall(body)
for item in data:
    print(item[0])
    print(item[1])
    print(item[2])
    print(item[3])
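The params and proxies arguments of get() are not exercised in the example above. A minimal sketch of how they would be passed follows; the URL, query parameters, and proxy address are hypothetical placeholders.

# Hypothetical sketch -- URL, query parameters, and proxy address are placeholders.
proxy = {"http": "http://127.0.0.1:8888", "https": "http://127.0.0.1:8888"}
r = util.get("http://example.com/list",
             params={"page": 2},
             headers=head,
             proxies=proxy)
if r[0] == 1:
    body = r[1].decode()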