测试网站首页
测试网站爬取信息的子页面
直接请求子页面(带UA)
import requests
# from parsel
# Request the sub-page directly, sending only a User-Agent header (no cookie).
url = 'http://www.porters.vip/verify/cookie/content.html'
headers = {'user-agent': 'PostmanRuntime/7.26.8'}
response = requests.get(url, headers=headers)
# HTTP status code reference:
#   403 - the server received the request but refused to fulfil it
#   404 - the requested resource does not exist
#   200 - the server processed the request successfully
# Emit the status code, a 20-dash separator and the body, one per line.
print(response.status_code, "-" * 20, response.text, sep="\n")
输出结果
--------------------
<!DOCTYPE html>
<html lang="zh-CN">
<head>
<meta charset="utf-8">
<meta http-equiv="X-UA-Compatible" content="IE=edge">
<meta name="viewport" content="width=device-width, initial-scale=1">
<!-- 上述3个meta标签*必须*放在最前面,任何其他内容都*必须*跟随其后! -->
<title>Steamboat-反爬虫练习</title>
<style>
.font-color {
color:#1eb97b;
}
.media-list {
margin-top:5px;
#####此处省略,该段代码是首页而非子页的html ######
换一个浏览器直接请求子页会发现页面被重定向到主页
http://www.porters.vip/verify/cookie/content.html
通过F12对照我发现index首页页面响应了一个cookie
这里用Postman携带cookie直接尝试访问子页,cookie直接从IE浏览器中复制
访问成功,说明必须先访问首页生成cookie,然后携带该cookie再去访问子页面。当然,因为cookie是静态的(没有生成时间参数),也可以直接从浏览器中复制
使用response.cookies自动获取首页返回的cookie
import requests
from lxml import etree
# from parsel
# Step 1: request the index page first and keep its response, whose cookies
# are reused for the sub-page request below.
url_index = 'http://www.porters.vip/verify/cookie/index.html'
headers = {
    'user-agent': 'PostmanRuntime/7.26.8'
}
response_index = requests.get(url=url_index, headers=headers)
# HTTP status code reference:
#   403 - the server received the request but refused to fulfil it
#   404 - the requested resource does not exist
#   200 - the server processed the request successfully
print(response_index.status_code)
print("-"*20)
print(response_index.text)

# Step 2: request the sub-page, passing along the cookies that came back with
# the index response.
cookies_index = response_index.cookies
url_son = 'http://www.porters.vip/verify/cookie/content.html'
response_son = requests.get(url=url_son, headers=headers, cookies=cookies_index)
# Report the status of the sub-page request itself (the original printed the
# index response's status a second time here).
print(response_son.status_code)
print("-"*20)

# Step 3: parse the sub-page HTML and pull out the text nodes of interest.
response_son_text = response_son.text
response_son_html = etree.HTML(response_son_text)
# Text of every <h1> element.
result_titles = response_son_html.xpath("//h1/text()")
# Text of <p> children of the div whose class is exactly 'page-header'.
result_person_time = response_son_html.xpath("//div[@class='page-header']/p/text()")
# Text of <p> children of the div whose class is exactly 'left col-md-10'.
result_body = response_son_html.xpath("//div[@class='left col-md-10']/p/text()")
print(result_titles)
print(result_person_time)
print(result_body)
直接复制浏览器中的cookie
import requests
from lxml import etree
# from parsel
# Step 1: request the index page and print its status and body. In this
# variant the index response is only printed; the cookie sent to the sub-page
# is the hard-coded one defined below.
url_index = 'http://www.porters.vip/verify/cookie/index.html'
headers = {
    'user-agent': 'PostmanRuntime/7.26.8'
}
response_index = requests.get(url=url_index, headers=headers)
# HTTP status code reference:
#   403 - the server received the request but refused to fulfil it
#   404 - the requested resource does not exist
#   200 - the server processed the request successfully
print(response_index.status_code)
print("-"*20)
print(response_index.text)

# Step 2: request the sub-page with a fixed cookie value (copied by hand from
# the browser, per the surrounding notes).
cookies_index = {'isfirst': '789kq7uc1pp4c'}
url_son = 'http://www.porters.vip/verify/cookie/content.html'
response_son = requests.get(url=url_son, headers=headers, cookies=cookies_index)
# Report the status of the sub-page request itself (the original printed the
# index response's status a second time here).
print(response_son.status_code)
print("-"*20)
print(response_son.text)

# Step 3: parse the sub-page HTML and pull out the text nodes of interest.
response_son_text = response_son.text
response_son_html = etree.HTML(response_son_text)
# Text of every <h1> element.
result_titles = response_son_html.xpath("//h1/text()")
# Text of <p> children of the div whose class is exactly 'page-header'.
result_person_time = response_son_html.xpath("//div[@class='page-header']/p/text()")
# Text of <p> children of the div whose class is exactly 'left col-md-10'.
result_body = response_son_html.xpath("//div[@class='left col-md-10']/p/text()")
print(result_titles)
print(result_person_time)
print(result_body)