Python爬虫基础

最新推荐文章于 2024-11-04 14:27:30 发布

陈崇人挺好

最新推荐文章于 2024-11-04 14:27:30 发布

阅读量838

点赞数 2

分类专栏： python 网络爬虫 文章标签：爬虫 python 开发语言

本文链接：https://blog.csdn.net/qwer2001727/article/details/123445045

版权

python 网络爬虫专栏收录该内容

6 篇文章 0 订阅

订阅专栏

#导入功能库 urllib2
import urllib2
 
#调用urllib2库的urlopen方法 第一个参数url即为URL
response = urllib2.urlopen("http://www.baidu.com")
 
#response 对象有一个 read 方法，可以返回获取到的网页内容。
print response.read()

response = urllib2.urlopen("http://www.baidu.com")

首先我们调用的是 urllib2 库里面的 urlopen 方法，传入一个 URL。这个网址是百度首页，协议是 HTTP 协议；当然你也可以把 HTTP 换成 FTP、FILE、HTTPS 等等，它们只是代表了不同的访问控制协议。urlopen 一般接受三个参数：urlopen(url, data, timeout)。第一个参数 url 即为 URL，第二个参数 data 是访问 URL 时要传送的数据，第三个参数 timeout 是设置超时时间。第二、三个参数是可以不传送的，data 默认为 None，timeout 默认为 socket._GLOBAL_DEFAULT_TIMEOUT。第一个参数 URL 是必须要传送的，在这个例子里面我们传送了百度的 URL。执行 urlopen 方法之后，返回一个 response 对象，返回信息便保存在这里面。

print "第二种方法"
request = urllib2.Request(url)
#模拟Mozilla浏览器进行爬虫
request.add_header("user-agent","Mozilla/5.0")
response2 = urllib2.urlopen(request)
print response2.getcode()
print len(response2.read())

print "第三种方法"
cookie = cookielib.CookieJar()
#加入urllib2处理cookie的能力
opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cookie))
urllib2.install_opener(opener)
response3 = urllib2.urlopen(url)
print response3.getcode()
print len(response3.read())
print cookie

爬虫程序添加data、header,然后post请求

# urllib supplies urlencode(); urllib2 builds and sends the request.
import urllib
import urllib2

# Address the form is submitted to.
url = 'http://www.server.com/login'
# Value sent in the User-Agent header.
user_agent = 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'
# Form fields to transmit with the request.
values = {'username': 'cqc', 'password': 'XXXX'}
# Extra HTTP headers for the request.
headers = {'User-Agent': user_agent}
# Encode the form fields as an application/x-www-form-urlencoded string.
data = urllib.urlencode(values)
# Supplying `data` makes urllib2 issue a POST instead of a GET.
request = urllib2.Request(url, data, headers)
# Send the request.
response = urllib2.urlopen(request)
try:
    # Body of the page returned by the server.
    page = response.read()
finally:
    # Always release the connection, even if read() raises.
    response.close()

爬虫程序添加cookie

import urllib
import urllib2
import cookielib
 
filename = 'cookie.txt'
#声明一个MozillaCookieJar对象实例来保存cookie，之后写入文件
cookie = cookielib.MozillaCookieJar(filename)
opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cookie))
postdata = urllib.urlencode({
			'stuid':'201200131012',
			'pwd':'23342321'
		})
#登录教务系统的URL
loginUrl = 'http://jwxt.sdu.edu.cn:7890/pls/wwwbks/bks_login2.login'
#模拟登录，并把cookie保存到变量
result = opener.open(loginUrl,postdata)
#保存cookie到cookie.txt中
cookie.save(ignore_discard=True, ignore_expires=True)
#利用cookie请求访问另一个网址，此网址是成绩查询网址
gradeUrl = 'http://jwxt.sdu.edu.cn:7890/pls/wwwbks/bkscjcx.curscopre'
#请求访问成绩查询网址
result = opener.open(gradeUrl)
print result.read()

正则表达式是对字符串操作的一种逻辑公式，就是用事先定义好的一些特定字符、及这些特定字符的组合，组成一个“规则字符串”，这个“规则字符串”用来表达对字符串的一种过滤逻辑

常用的元字符

代码	说明
.	匹配除换行符以外的任意字符
\w	匹配字母或数字或下划线或汉字
\s	匹配任意的空白符
\d	匹配数字
\b	匹配单词的开始或结束
^	匹配字符串的开始（在集合字符里[^a]表示非（不匹配）的意思）
$	匹配字符串的结束

import re

# Compile a pattern; the r prefix makes a raw string, so backslashes reach
# the regex engine unchanged.
pattern = re.compile(r"hello")
# re.match only tries to match at the very start of the text
# (re.search would scan the whole string instead).
result1 = re.match(pattern, "hello hello")
print(result1)

# "." matches any single character except a newline.
pattern2 = re.compile(r'a.c')
result2 = re.match(pattern2, 'abcdefgg')
print(result2)

# A backslash escapes the dot so that it matches a literal ".".
pattern3 = re.compile(r'a\.c')
result3 = re.match(pattern3, 'a.cdefgg')
print(result3)

# [...] matches one character from a set: "-" denotes a range and a leading
# "^" negates the set. Ranges are written back to back; a comma inside the
# brackets would itself be matched, and the range A-z would also cover the
# punctuation between "Z" and "a".
pattern4 = re.compile(r"a[a-zA-Z]bc")
result4 = re.match(pattern4, r'aabcv')
print(result4)

# \d matches a digit, \D a non-digit.
pattern6 = re.compile(r"a\dbc")
result6 = re.match(pattern6, 'a6bcdd')
print(result6)

# \s matches a whitespace character, \S a non-whitespace character.
pattern7 = re.compile(r"a\sbc")
result7 = re.match(pattern7, 'a bcdd')
print(result7)

# \w matches a word character (letter, digit or underscore), \W anything else.
# The space in 'a bcdd' is not a word character, so this prints None.
pattern8 = re.compile(r"a\wbc")
result8 = re.match(pattern8, 'a bcdd')
print(result8)

# E-mail address. The local part uses \w+ rather than \d+ so that addresses
# containing letters before the "@", like the sample below, are matched.
pattern9 = re.compile(r"\w+@\w+\.\w+")
result9 = re.search(pattern9, "1231qw@qq.com")
print(result9)
# "*" repeats the preceding item zero or more times.
rexg = re.compile(r'\d*\w*')
res = re.search(rexg, '1dddd')
print(res)

# "+" repeats the preceding item one or more times.
rexg = re.compile(r'\d+\w')
res = re.search(rexg, '1dddd')
print(res)

# "?" makes the preceding item optional (zero or one occurrence).
# search() finds '3ddd' inside the text; match() must start at position 0,
# where the pattern does not fit, so res2 is None.
rexg = re.compile(r'\d?ddd')
res = re.search(rexg, '123dddd')
res2 = re.match(rexg, '123dddd')
print(res)
print(res2)

# {m} repeats the preceding item exactly m times.
rexg = re.compile(r'1\d{10}')
res = re.search(rexg, '16666666666')
print(res)

# {m,n} repeats the preceding item between m and n times.
rexg = re.compile(r'\d{5,12}@\w{2}\.\w{3}')
res = re.search(rexg, '1436619325@qq.com')
print(res)

# A trailing "?" makes a quantifier non-greedy: it takes as few repetitions
# as possible, so only the first five digits are matched here.
rexg = re.compile(r'\d{5,10}?')
res = re.search(rexg, '1436619325')
print(res)

# "^" matches at the start of the string (and at the start of every line
# when re.MULTILINE is set).
rexg = re.compile(r'^abc')
res = re.search(rexg, 'abc123')
print(res)

# "$" matches at the end of the string (and at the end of every line when
# re.MULTILINE is set).
rexg = re.compile(r'abc$')
res = re.search(rexg, '123abc')
print(res)

# \A matches only at the start of the string, regardless of re.MULTILINE.
rexg = re.compile(r'\Aabc')
res = re.search(rexg, 'abc123')
print(res)

# \Z matches only at the end of the string, regardless of re.MULTILINE.
rexg = re.compile(r'abc\Z')
res = re.search(rexg, '123abc')
print(res)

# "|" matches either alternative: an 11-digit number starting with 1, or a
# 5-12 digit QQ mail address. The second branch needs \d (digits); a bare
# "d" would only match the literal letter.
rexg = re.compile(r'1\d{10}|\d{5,12}@qq\.com')
res = re.search(rexg, 'dsafsd13424234324234sssss111111111111@qq.com')
print(res)
# Grouping: the parentheses turn "abc" into one unit, so {3} repeats the
# whole group three times.
rexg = re.compile(r'(abc){3}')
res = rexg.search('abcabcabcss')
print(res)

# Named group: (?P<tt>...) captures under the name "tt" and (?P=tt) requires
# the same text to appear again.
rexg = re.compile(r'(?P<tt>abc)88(?P=tt)')
res = rexg.search('abcabc88abcss')
print(res)

# Numbered group: \1 refers back to whatever the first group captured.
rexg = re.compile(r'(\d{3})uu\1')
res = rexg.search('123uu123')
print(res)