python中123+5.0的执行结果_一天掌握python爬虫-CSDN博客

一天掌握python爬虫日记：

(小爬虫，NO 我们是大蜘蛛 )

数据抓取:

requests：

requests 的底层实现其实就是 urllib3

开源地址：https://github.com/kennethreitz/requests

中文文档 API： http://docs.python-requests.org/zh_CN/latest/index.html

基本GET请求(headers参数和 params参数)：

import requests

url = "http://www.baidu.com"

kw = {'wd': '爬虫'}

headers = {

"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.99 Safari/537.36"}

# params 接收一个字典或者字符串的查询参数，字典类型自动转换为url编码，不需要urlencode()

# res = requests.get(url=url, params=kw, headers=headers)

# 也可以这么写

res = requests.request(method="GET", url=url, params=kw, headers=headers)

# print(res.text) # 查看响应内容，response.text 返回的是Unicode格式的数据

print(res.content.decode("utf-8")) # 查看响应内容，response.content返回的字节流数据

print(res.url) # 查看完整url地址

print(res.encoding) # 查看响应头部字符编码

print(res.status_code) # 查看响应码

StringIO与BytesIO：

# StringIO在内存中读写str。

# BytesIO 是在内存中读写bytes类型的二进制数据

# 通过requests获取网络上图片的大小

from io import BytesIO, StringIO

from PIL import Image # pip install pillow 安装

img_url = "http://imglf1.ph.126.net/pWRxzh6FRrG2qVL3JBvrDg==/6630172763234505196.png"

res = requests.get(img_url)

f = BytesIO(res.content)

img = Image.open(f)

print(img.size)

基本POST请求(data参数)

import requests

url = 'https://www.lagou.com/jobs/positionAjax.json?city=%E5%8C%97%E4%BA%AC&needAddtionalResult=false'

headers = {

'User-Agent': 'User-Agent:Mozilla/5.0(Windows;U;WindowsNT6.1;en-us)AppleWebKit/534.50(KHTML,likeGecko)Version/5.1Safari/534.50',

'Referer': 'https://www.lagou.com/jobs/list_python?city=%E5%8C%97%E4%BA%AC&cl=false&fromSearch=true&labelWords=&suginput=',

'Accept-Language': 'zh-Hans-CN,zh-Hans;q=0.5'

}

data = {

'first': 'true',

'kd': 'python',

'pn': 1

}

resp = requests.post(url=url, data=data, headers=headers)

# print(resp.content.decode('utf-8'))

print(resp.json()) # # 如果是json文件可以直接显示,返回字典类型

代理(proxies参数)

proxies = {

'http': '39.137.2.206:8080'

}

# 如果代理需要使用HTTP Basic Auth，可以使用下面这种格式：

# proxy = { "http": "name:pass@61.158.163.130:16816" }

res = requests.get("http://httpbin.org/ip", proxies=proxies)

print(res.text)

# {

# "origin": "39.137.2.206"

# }

也可以通过本地环境变量 HTTP_PROXY 和 HTTPS_PROXY 来配置代理：

export HTTP_PROXY="http://39.137.2.206:8080"

export HTTPS_PROXY="https://12.34.56.79:9527"

web客户端验证

如果是Web客户端验证，需要添加 auth = (账户名, 密码)

import requests

auth=('test', '123456')

response = requests.get('http://192.168.199.107', auth = auth)

print (response.text)

Cookies:

res = requests.get(url="http://www.baidu.com/")

print(res.cookies) # <RequestsCookieJar[<Cookie BDORZ=27315 for .baidu.com/>]>

# print(res.cookies.get_dict()) # {'BDORZ': '27315'}

cookiedict = requests.utils.dict_from_cookiejar(res.cookies)

print(cookiedict) # {'BDORZ': '27315'}

session:

url = 'http://www.renren.com/PLogin.do'

headers = {

'User-Agent': 'User-Agent:Mozilla/5.0(Windows;U;WindowsNT6.1;en-us)AppleWebKit/534.50(KHTML,likeGecko)Version/5.1Safari/534.50',

}

data = {

# 'email': 'xxxx@qq.com',

# 'password': 'xxxx'

}

session = requests.session() # 创建session对象，可以保存Cookie值

# 发送附带用户名和密码的请求，并获取登录后的Cookie值，保存在session里

session.post(url=url, data=data, headers=headers)

# session包含用户登录后的Cookie值，可以直接访问那些登录后才可以访问的页面

response = session.get("http://www.renren.com/410043129/profile")

print(response.text)

处理HTTPS请求 SSL证书验证:

# 跳过证书验证，把 verify 设置为 False

resp = requests.get('https://www.12306.cn/mormhweb/', verify=False)

print(resp.content.decode('utf-8'))

==========================================================================================

urllib库的基本使用:

在 python2 中，urllib 被分为urllib,urllib2等

request:

from urllib import request

# urlopen

# res = request.urlopen(url='http://www.baidu.com')

# print(res.read()) # 读取响应全部内容，Python3 中返回 bytes，需要 decode() 才能得到字符串

# print(res.readlines())

# print(res.getcode()) # 200

# Request执行更复杂的操作

headers = {"User-Agent": "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0;"}

req = request.Request(url='http://www.baidu.com', headers=headers,method='GET')

# 也可以通过调用Request.add_header() 添加/修改一个特定的header

req.add_header("Connection", "keep-alive")

# 也可以通过调用Request.get_header()来查看header信息

print(req.get_header(header_name='Connection')) # keep-alive

res = request.urlopen(req)

# print(res.read().decode())

print(res.code) # 可以查看响应状态码

# parse

from urllib import parse

url = 'http://www.baidu.com/s?'

wd = {'wd': '爬虫'}

ps = parse.urlencode(wd) # 通过urllib.parse.urlencode()方法，将字典键值对按URL编码转换，从而能被web服务器接受

url = url + ps

print(url) # http://www.baidu.com/s?wd=%E7%88%AC%E8%99%AB

url = parse.parse_qs(url)

print(url) # {'http://www.baidu.com/s?wd': ['爬虫']}

# 通过parse.unquote方法，把 URL编码字符串，转换回原先字符串

url = parse.unquote("wd=%E7%88%AC%E8%99%AB")

print(url) # wd=爬虫

url = 'http://www.baidu.com/s;hello;123?wd=sss&name=qqq#a'

result = parse.urlparse(url)

print(result)

# ParseResult(scheme='http', netloc='www.baidu.com', path='/s', params='hello;123', query='wd=sss&name=qqq', fragment='a')

result2 = parse.urlsplit(url)

print(result2)

# SplitResult(scheme='http', netloc='www.baidu.com', path='/s;hello;123', query='wd=sss&name=qqq', fragment='a')

# 处理HTTPS请求 SSL证书验证

from urllib import request

import ssl

context = ssl._create_unverified_context() # 表示忽略未经核实的SSL证书认证

url = "https://www.12306.cn/mormhweb/"

headers = {

"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.99 Safari/537.36"}

req = request.Request(url, headers=headers)

# 在urlopen()方法里指明添加 context 参数

res = request.urlopen(url=req, context=context)

print(res.read().decode())

Handler处理器和自定义Opener:

from urllib import request

# 构建一个HTTPHandler 处理器对象，支持处理HTTP请求

# debuglevel=1参数，还会将 Debug Log 打开，程序在执行的时候，会把收包和发包的报头在屏幕上自动打印出来

http_handler = request.HTTPHandler(debuglevel=1)

# http_handler = request.HTTPSHandler() # 支持处理HTTPS请求

# 调用urllib.request.build_opener()方法，创建支持处理HTTP请求的opener对象

opener = request.build_opener(http_handler)

req = request.Request('http://www.baidu.com/')

# 调用自定义opener对象的open()方法，发送request请求

res = opener.open(req)

print(res.code)

ProxyHandler处理器(代理设置)：

from urllib import request

httpproxy_handler = request.ProxyHandler({'http': '39.137.2.206:8080'})

req = request.Request('http://httpbin.org/ip')

opener = request.build_opener(httpproxy_handler)

# res = opener.open(req) # 只有使用opener.open()方法发送请求才使用自定义的代理，而urlopen()则不使用自定义代理

# opener应用到全局

request.install_opener(opener)

res = request.urlopen(req)

print(res.read())

cookiejar库和 HTTPCookieProcessor处理器:

from http import cookiejar

from urllib import request

# 构建一个CookieJar对象实例来保存cookie

# cookiejar = cookiejar.CookieJar()

cookiejar = cookiejar.MozillaCookieJar('cookie.txt')

cookiejar.load(ignore_discard=True) # 读取 True 包括短暂性的cookie

# 使用HTTPCookieProcessor()来创建cookie处理器对象，参数为CookieJar()对象

handler = request.HTTPCookieProcessor(cookiejar)

opener = request.build_opener(handler)

opener.open('http://www.baidu.com')

# cookiejar.save(ignore_discard=True) # 保存

cookieStr = ""

for item in cookiejar:

print(item.name+'='+item.value)

urlretrieve:

from urllib import request

img_url = "http://n.sinaimg.cn/news/1_img/upload/cf3881ab/138/w1535h1003/20181029/MOVg-hnaivxq1076478.jpg"

request.urlretrieve(img_url,filename='img.jpg')

================================================================================================

数据提取:

正则re模块:

import re

text = 'hello'

# .匹配任意字符

com = re.compile('.+')

# match从头开始匹配

ret = re.match(com, text)

print(ret.group())

# \d匹配任意数字

# \D 匹配任意非数字

# \s 任意空白字符(\r \n \t 空格)

# \w a-z A-Z 数字下划线

# \W 与\w相反

# [ ]组合，只要满足中括号中的某一项就算成功

# [^0-9] 非0-9 (脱字号在中括号内表示取反) ^a 以a开头

# + 一个或多个

# * 零个或多个

# ? 零个或一个

# {n} n个

# {m,n} m-n个

# $ 以...结尾，例如 a$ 表示以a结尾

# | 匹配多个字符或表达式

# 贪婪模式(默认) 非贪婪模式

text = '<h1>标题1</h1>'

ret = re.match('<.+?>', text) # 非贪婪模式，只匹配到 <h1>

# ret = re.match('<.+>', text) # 贪婪模式，匹配到 <h1>标题1</h1>

print(ret.group())

# 匹配0-100之间的数字

text = '0'

ret = re.match('[1-9]\d?$|100$|0$', text)

print(ret.group())

# 转义字符 \

text = 'apple price is $299'

ret = re.search('\$(\d+)', text)

print(ret.group(1))

# 打印\n

# text = '\\n'

text = r'\n' # r原生的

print(text)

text = '\\n' # =>\n

# python \\\\n => \\n

# 正则 \\n=> \n

# ret = re.match('\\\\n', text)

ret = re.match(r'\\n', text)

print(ret.group())

# compile编译

# pattern = re.compile('\d+\.?\d+')

# 多行

pattern = re.compile("""

\d+

\.?

\d+

""", re.VERBOSE)

text = 'the number is 20.50'

s = re.search(pattern, text)

print(s.group())

'''

Pattern 对象的一些常用方法主要有：

match 方法：从起始位置开始查找，一次匹配

search 方法：从任何位置开始查找，一次匹配

findall 方法：全部匹配，返回列表

finditer 方法：全部匹配，返回迭代器

split 方法：分割字符串，返回列表

sub 方法：替换

'''

# match

pattern = re.compile(r'\d+')

m = pattern.match('one123a', 3, 7)

print(m) # <_sre.SRE_Match object; span=(3, 6), match='123'>

print(m.group()) # 123

print(m.start()) # 3

print(m.end()) # 6

print(m.span()) # (3, 6) 返回匹配成功的整个子串的索引

# search

text = "apple's price $99,orange's price is $10"

pattern = re.compile('.*(\$\d+).*(\$\d+)', re.I) # # re.I 表示忽略大小写

ret = pattern.search(text)

# print(ret.group(0)) 相当于print(ret.group())

print(ret.group(1))

print(ret.group(2))

# 所有的组

print(ret.groups()) # ('$99', '$10')

print(ret.span()) # (0, 39) # 起始位置和结束位置

# findall

ret = re.findall('\$\d+', text)

print(ret) # ['$99', '$10']

# finditer

res = re.finditer('\$\d+', text)

for m in res:

print(m.group(), m.span()) # $99 (14, 17) # $10 (36, 39)

# split

text1 = 'hello2world ni hao '

ret = re.split(' |\d', text1) # 空格或数字

print(ret) # ['hello', 'world', 'ni', 'hao', '']

# sub

ret = re.sub('\$\d+', '0', text)

print(ret) # apple's price 0,orange's price is 0

pattern = re.compile(r'(\w+) (\w+)') # \w 相当于 [A-Za-z0-9_]

s = 'a b,c d'

res = pattern.sub('123', s)

print(res) # 123,123

res = pattern.sub(r'\2 \1', s) # 引用分组

print(res) # b a,d c

print(pattern.sub(lambda m: m.group(2) + m.group(1), s)) # ba,dc

print(pattern.sub(lambda m: m.group(2) + m.group(1), s, 1)) # ba,c d # 最多替换一次

# 匹配中文

title = '你好，hello，世界'

pattern = re.compile(r'[\u4e00-\u9fa5]+')

result = pattern.findall(title)

print(result) # ['你好', '世界']

=======================================================================================

XPath 开发工具

开源的XPath表达式编辑工具:XMLQuire(XML格式文件可用)

Chrome插件 XPath Helper

Firefox插件 try XPath

最常用的路径表达式：

表达式描述

nodename 选取此节点的所有子节点。

/ 从根节点选取。

// 从匹配选择的当前节点选择文档中的节点，而不考虑它们的位置。

. 选取当前节点。

.. 选取当前节点的父节点。

@ 选取属性。

lxml库:

lxml 是一个HTML/XML的解析器，主要的功能是如何解析和提取 HTML/XML 数据。

lxml python 官方文档：http://lxml.de/index.html

'''

最常用的路径表达式：

表达式描述

nodename 选取此节点的所有子节点。

/ 从根节点选取。子元素

// 从匹配选择的当前节点选择文档中的节点，而不考虑它们的位置。

. 选取当前节点。

.. 选取当前节点的父节点。

@ 选取属性。

路径表达式结果

/bookstore/book[1] 选取属于 bookstore 子元素的第一个 book 元素。

/bookstore/book[last()] 选取属于 bookstore 子元素的最后一个 book 元素。

/bookstore/book[last()-1] 选取属于 bookstore 子元素的倒数第二个 book 元素。

/bookstore/book[position()<3] 选取最前面的两个属于 bookstore 元素的子元素的 book 元素。

//title[@lang] 选取所有拥有名为 lang 的属性的 title 元素。

//title[@lang='eng'] 选取所有 title 元素，且这些元素拥有值为 eng 的 lang 属性。

/text() 文本内容

//input[starts-with(@name,'name1')] 查找name属性中开始位置包含'name1'关键字的页面元素

//input[contains(@name,'na')] 查找name属性中包含na关键字的页面元素

通配符描述

* 匹配任何元素节点。

@* 匹配任何属性节点。

'''

# lxml

from lxml import etree

text = '''

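<div>
<ul>
<li class="item-0"><a href="link1.html">first item</a></li>
<li class="item-1"><a href="link2.html">second item</a></li>
<li class="item-inactive"><a href="link3.html">third item</a></li>
<li class="item-1"><a href="link4.html">fourth item</a></li>
<li class="item-0"><a href="link5.html">fifth item</a> # 注意，此处缺少一个 </li> 闭合标签
</ul>
</div>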

'''

# html = etree.HTML(text) # 利用etree.HTML，将字符串解析为HTML文档

# res = etree.tostring(html) # 按字符串序列化HTML文档

# print(res) # lxml 可以自动修正 html 代码，例子里不仅补全了 li 标签，还添加了 body，html 标签

# print(type(html)) # # 显示etree.parse() 返回类型

# parser = etree.HTMLParser(encoding='utf-8')

# html = etree.parse(source='tencent.html', parser=parser) # 读取外部文件

# print(etree.tostring(html, pretty_print=True)) # 美观的打印

# 实例测试

html = etree.HTML(text)

res = html.xpath('//li/@class') # 获取 <li> 标签的所有 class属性

print(res) # ['item-0', 'item-1', 'item-inactive', 'item-1', 'item-0']

res = html.xpath('//li/a[@href="link1.html"]') # 获取 <li> 标签下 href 为 link1.html 的 <a> 标签

print(res) # [<Element a at 0x...>]

res = html.xpath('//li/a/@href') # 获取

标签下的