Python网络爬虫

大胡子哥哥哥

于 2022-03-13 21:51:35 发布

阅读量121

点赞数

文章标签： python

本文链接：https://blog.csdn.net/m0_65668020/article/details/123467059

版权

# Fetch a web page and print its raw body.
# urllib2 exists only on Python 2; on Python 3 the same function lives in
# urllib.request, so try that first and fall back.
from contextlib import closing

try:
    from urllib.request import urlopen  # Python 3
except ImportError:
    from urllib2 import urlopen  # Python 2

# The first argument of urlopen is the URL to request. closing() releases
# the connection even if read() raises.
with closing(urlopen("http://www.baidu.com")) as response:
    # read() returns the body of the fetched page (a bytes object on
    # Python 3, a byte string on Python 2).
    print(response.read())

为爬虫程序添加 data 和 headers，然后发送 POST 请求

# Send a POST request carrying form data and a custom User-Agent header.
# The imports work on both Python 3 (urllib.parse / urllib.request) and
# Python 2 (urllib / urllib2).
from contextlib import closing

try:
    from urllib.parse import urlencode  # Python 3
    from urllib.request import Request, urlopen
except ImportError:
    from urllib import urlencode  # Python 2
    from urllib2 import Request, urlopen

# Target address.
url = 'http://www.server.com/login'

# Value sent in the User-Agent request header.
user_agent = 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'

# Form fields transmitted to the URL as the request body.
values = {'username' : 'cqc', 'password' : 'XXXX' }

# Request headers.
headers = { 'User-Agent' : user_agent }

# URL-encode the form fields. Python 3 requires the body to be bytes; the
# encoded text is pure ASCII, so encode() is a no-op on Python 2.
data = urlencode(values).encode('ascii')

# Build the request; supplying data makes it a POST.
request = Request(url, data, headers)

# Perform the request and read the returned page; closing() releases the
# connection even if read() raises.
with closing(urlopen(request)) as response:
    page = response.read()

添加cookie

# Log in with a cookie-aware opener, persist the cookies to disk, then reuse
# them to request a second page.
# The imports work on both Python 3 and Python 2.
from contextlib import closing

try:
    from http.cookiejar import MozillaCookieJar  # Python 3
    from urllib.parse import urlencode
    from urllib.request import HTTPCookieProcessor, build_opener
except ImportError:
    from cookielib import MozillaCookieJar  # Python 2
    from urllib import urlencode
    from urllib2 import HTTPCookieProcessor, build_opener

filename = 'cookie.txt'

# A MozillaCookieJar collects the cookies and can later write them to
# `filename`.
cookie = MozillaCookieJar(filename)

# Every request made through this opener stores and sends cookies via the jar.
opener = build_opener(HTTPCookieProcessor(cookie))

# Login form fields. Python 3 requires the POST body to be bytes; the
# URL-encoded text is pure ASCII, so encode() is a no-op on Python 2.
postdata = urlencode({
    'stuid':'201200131012',
    'pwd':'23342321'
}).encode('ascii')

# Login URL of the academic affairs system.
loginUrl = 'http://jwxt.sdu.edu.cn:7890/pls/wwwbks/bks_login2.login'

# Simulate the login; the cookie processor records the cookies from the
# response in the jar. The body is not needed, so release the connection
# straight away.
result = opener.open(loginUrl,postdata)
result.close()

# Write the cookies to cookie.txt, including session and expired ones.
cookie.save(ignore_discard=True, ignore_expires=True)

# Grade query URL, requested with the cookies obtained above.
gradeUrl = 'http://jwxt.sdu.edu.cn:7890/pls/wwwbks/bkscjcx.curscopre'

# Request the grade page and print its raw body.
with closing(opener.open(gradeUrl)) as result:
    print(result.read())

正则表达式

正则表达式是对字符串操作的一种逻辑公式，就是用事先定义好的一些特定字符、及这些特定字符的组合，组成一个“规则字符串”，这个“规则字符串”用来表达对字符串的一种过滤逻辑

常用的元字符

代码	说明
.	匹配除换行符以外的任意字符
\w	匹配字母或数字或下划线或汉字
\s	匹配任意的空白符
\d	匹配数字
\b	匹配单词的开始或结束
^	匹配字符串的开始（在字符集合中，如 [^a]，表示“非”，即不匹配集合中的字符）
$	匹配字符串的结束

import re

# Compile the matching rule once. The r prefix makes it a raw string, so
# backslashes reach the regex engine untouched.
pattern = re.compile(r"hello")

# match() only tries at the very beginning of the text, so this returns the
# first "hello" (search() would find the same span here).
result1 = pattern.match("hello hello")
print(result1)

# An unescaped "." stands for any single character except a newline.
pattern2 = re.compile(r'a.c')
result2 = pattern2.match('abcdefgg')
print(result2)

# A backslash escapes the dot, so it only matches a literal ".".
pattern3 = re.compile(r'a\.c')
result3 = pattern3.match('a.cdefgg')
print(result3)

# [...] matches one character out of a set: "-" builds a range and a leading
# "^" negates the set.
# NOTE: the set is written [a-zA-Z] on purpose. Inside brackets a comma is a
# literal character, and the range A-z also covers the ASCII punctuation
# between "Z" and "a" ([ \ ] ^ _ `), so [a-z,A-z] would accept non-letters.
pattern4 = re.compile(r"a[a-zA-Z]bc")
result4 = re.match(pattern4, r'aabcv')
print(result4)

# \d matches one digit; \D matches anything that is not a digit.
pattern6 = re.compile(r"a\dbc")
result6 = pattern6.match('a6bcdd')
print(result6)

# \s matches one whitespace character; \S matches any non-whitespace one.
pattern7 = re.compile(r"a\sbc")
result7 = pattern7.match('a bcdd')
print(result7)

# \w matches one word character (letter, digit or underscore); \W is its
# complement. The space in the sample is not a word character, so this
# prints None.
pattern8 = re.compile(r"a\wbc")
result8 = pattern8.match('a bcdd')
print(result8)

# Match an e-mail address.
# The local part uses \w+ rather than \d+: with \d+ the digits would have to
# sit directly before "@", and the sample "1231qw@qq.com" would not match.
pattern9 = re.compile(r"\w+@\w+\.\w+")
result9 = re.search(pattern9, "1231qw@qq.com")
print(result9)

# "*" repeats the preceding token zero or more times.
rexg = re.compile(r'\d*\w*')
# The whole sample is consumed: "1" by \d* and "dddd" by \w*.
res = rexg.search('1dddd')
print(res)

# "+" repeats the preceding token one or more times.
rexg = re.compile(r'\d+\w')
# One digit followed by exactly one word character: "1d".
res = rexg.search('1dddd')
print(res)

# "?" makes the preceding token optional (zero or one occurrence).
rexg = re.compile(r'\d?ddd')
# search() scans forward and finds "3ddd"; match() is pinned to index 0,
# where "123..." cannot satisfy the pattern, so res2 is None.
res = rexg.search('123dddd')
res2 = rexg.match('123dddd')
print(res)
print(res2)

# {m} repeats the preceding token exactly m times: a "1" followed by ten
# more digits.
rexg = re.compile(r'1\d{10}')
res = rexg.search('16666666666')
print(res)

# {m,n} repeats the preceding token between m and n times (inclusive).
rexg = re.compile(r'\d{5,12}@\w{2}\.\w{3}')
res = rexg.search('1436619325@qq.com')
print(res)

# Non-greedy mode: a "?" after a quantifier makes it take as few repetitions
# as possible, so only the minimum of five digits is consumed here.
rexg = re.compile(r'\d{5,10}?')
res = rexg.search('1436619325')
print(res)

# "^" anchors the match at the start of the string (and at the start of
# every line when re.MULTILINE is set).
rexg = re.compile(r'^abc')
res = rexg.search('abc123')
print(res)

# "$" anchors the match at the end of the string (and at the end of every
# line when re.MULTILINE is set).
rexg = re.compile(r'abc$')
res = rexg.search('123abc')
print(res)

# \A matches only at the very start of the string; unlike "^" it is not
# affected by re.MULTILINE.
rexg = re.compile(r'\Aabc')
res = rexg.search('abc123')
print(res)

# \Z matches only at the very end of the string; unlike "$" it is not
# affected by re.MULTILINE.
rexg = re.compile(r'abc\Z')
res = rexg.search('123abc')
print(res)

# "|" is alternation: the pattern matches if either side matches. Here that
# is an 11-digit number starting with "1", or a QQ mail address with a
# 5-12 digit account.
# The second branch must be \d{5,12}; without the backslash, "d{5,12}"
# matches the letter "d" repeated, not digits.
rexg = re.compile(r'1\d{10}|\d{5,12}@qq\.com')
# The leftmost match wins, so the 11-digit number earlier in the sample is
# returned rather than the mail address that follows it.
res = re.search(rexg, 'dsafsd13424234324234sssss111111111111@qq.com')
print(res)

# Parentheses form a group, so the quantifier applies to "abc" as a whole:
# three consecutive copies are required.
rexg = re.compile(r'(abc){3}')
res = rexg.search('abcabcabcss')
print(res)

# Named group: (?P<tt>...) captures under the name "tt", and (?P=tt) must
# then match the same text that the group captured.
rexg = re.compile(r'(?P<tt>abc)88(?P=tt)')
res = rexg.search('abcabc88abcss')
print(res)

# Numbered back-reference: \1 must repeat exactly what the first group
# captured.
rexg = re.compile(r'(\d{3})uu\1')
res = rexg.search('123uu123')
print(res)

大胡子哥哥哥

关注

0
点赞
踩
0

收藏

觉得还不错? 一键收藏
0
评论
Python网络爬虫

#导入功能库 urllib2import urllib2#调用urllib2库的urlopen方法第一个参数url即为URLresponse = urllib2.urlopen("http://www.baidu.com")#response 对象有一个 read 方法，可以返回获取到的网页内容。print response.read()...
复制链接

扫一扫