爬虫基础-龙运堂

最新推荐文章于 2022-05-09 14:40:17 发布

龙运堂

最新推荐文章于 2022-05-09 14:40:17 发布

阅读量57

点赞数

文章标签：爬虫

本文链接：https://blog.csdn.net/m0_68071733/article/details/123469167

版权

from urllib import request

# Approach 1: build an explicit Request object, then open it.
req = request.Request("http://www.baidu.com")

# Open the page; the context manager closes the connection afterwards.
with request.urlopen(req) as resp:
    # Read and print the raw response body (bytes).
    print(resp.read())

# Approach 2: hand the URL string directly to urlopen.
with request.urlopen("http://www.baidu.com") as response:
    # read() returns the page content; the original read from a misspelled,
    # undefined name here, which raised NameError.
    print(response.read())

2、添加data、header

import urllib
from urllib import request
# Target URL.
url = "http://www.zhihu.com/signin?next=%2F"

# Part of the request header: the browser identity to present.
user_agent = "Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)"

# Form parameters to submit.
# NOTE(review): credentials are hard-coded; presumably placeholders --
# confirm no real account data is kept in source.
values = {'username': '27506004', 'password': '12587496321'}

# urlencode() produces a str; Request expects bytes for `data`, so encode it.
data = urllib.parse.urlencode(values).encode(encoding='UTF8')

# Build the request headers.
headers = {
    'User-Agent': user_agent,
    'Referer': 'http://www.zhihu.com/articles',
}

# Build the request; supplying `data` makes urllib send it as a POST.
req = request.Request(url, data=data, headers=headers)

# Open the page and print the raw body; `with` closes the response.
with request.urlopen(req) as resp:
    print(resp.read())

3、爬虫添加cookie

from urllib import request

from http import cookiejar

# Cookie jar that will collect the cookies set by the server.
cookie = cookiejar.CookieJar()

# Cookie processor wrapping the jar.
handler = request.HTTPCookieProcessor(cookie)

# Opener ("downloader") that routes requests through the cookie processor.
opener = request.build_opener(handler)

# Fetch the page; cookies from the response are stored in `cookie`.
resp = opener.open("http://www.baidu.com")

# Iterate over the collected cookies and print each name/value pair.
for item in cookie:
    print('NAME=' + item.name)
    print('VALUE=' + item.value)

# 模拟登录教务系统

from urllib import request

import urllib

from http import cookiejar

# File the cookies are persisted to.
filename = 'cookie.txt'

# MozillaCookieJar can save its cookies to `filename`.
cookie = cookiejar.MozillaCookieJar(filename)

# Cookie processor wrapping the jar.
handler = request.HTTPCookieProcessor(cookie)

# Opener that sends and receives cookies through the processor.
opener = request.build_opener(handler)

# POST data: account + password, URL-encoded and converted to bytes.
postdata = urllib.parse.urlencode({
    'username': '23567454321',
    'password': '**********'
}).encode(encoding='UTF8')

# Login endpoint of the academic affairs system.
# NOTE(review): "longin.action" looks like a typo for "login.action" --
# confirm against the real site before changing it.
loginUrl = 'http://jwc.hnshzy.cn:90/hnshjw/cas/longin.action'

# Simulate the login; the opener records any cookies the server sets.
result = opener.open(loginUrl, postdata)

# Save cookies to the file, including session-only and expired ones.
cookie.save(ignore_discard=True, ignore_expires=True)

# Request another URL reusing the saved cookies.
new_url = 'http://jwc.hnshzy.cn:90/hnshjw/cas/longin.action'

# HTTPError must be caught before URLError because it is a subclass of it.
try:
    result = opener.open(new_url)
except request.HTTPError as e:
    # HTTPError exposes the HTTP status as `code`.
    if hasattr(e, 'code'):
        print(e.code)
except request.URLError as e:
    if hasattr(e, 'reason'):
        print(e.reason)
else:
    # Only reached when the request succeeded.
    print(result.read())

4、正则表达式

import re

# Pattern: 5-11 digits, "@", two word characters, ".", three word characters.
rexp = re.compile(r"\d{5,11}@\w{2}\.\w{3}")
# match() only succeeds if the pattern matches at the start of the string.
result = rexp.match('154944@qq.comujdhfbndhdh')
print(result)

# Greedy mode: \w* consumes as many word characters as it can.
rexp2 = re.compile(r'\w*')
result2 = rexp2.match("ssdshdiuwreiuwhr")
print(result2)

# Boundaries: ^ and $ anchor the pattern to the start and end of the string.
rexp3 = re.compile(r"^dsd$")
result3 = rexp3.match("abcsdsd123")
print(result3)

# \A anchors at the very start of the string.
rexp4 = re.compile(r"\Aabc")
result4 = rexp4.match("abcsdsd")
print(result4)

# \b is a word boundary, here between "a" and "!".
rexp5 = re.compile(r"a\b!bc")
result5 = rexp5.match("a!bcsdsd")
print(result5)

# Alternation: either "abc" or "efg"; search() scans the whole string.
rexp6 = re.compile(r"abc|efg")
result6 = rexp6.search("aboefgert")
print(result6)

# A group repeated exactly twice.
rexp7 = re.compile(r"(abc){2}")
result7 = rexp7.search("abcabcfgebs")
print(result7)

# Named group "p1".
rexp8 = re.compile(r"(?P<p1>abc)")
result8 = rexp8.search("abcefghijk")
print(result8)

# Numbered backreference: \1 must repeat what group 1 captured.
rexp9 = re.compile(r"(\d)abc\1")
result9 = rexp9.search("1abc1")
print(result9)

# Named backreference: (?P=tt) must repeat what group "tt" captured.
rexp10 = re.compile(r"(?P<tt>abc)efg(?P=tt)")
result10 = rexp10.search("abcefgbc")
print(result10)

龙运堂

关注

0
点赞
踩
0

收藏

觉得还不错? 一键收藏
0
评论
爬虫基础-龙运堂

#1from urllib import request#构造一个请求req=request.Request("http://www.baidu.com")#打开网页resp = request.urlopen(req)#读取网页的内容print(resp.read())#2#打开网页reponse=request.urlopen("http://www.baidu.com")#read方法读取网页内容print(reponse.read())2、添加da
复制链接

扫一扫