以此网站(www.sac.net.cn)为例
基础代码如下:
from lxml import etree
import requests
import csv
# 检查url地址
def check_link(url):
try:
ua = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.121 Safari/537.36"
Cookie = "__jsluid=a0ff288cfece0ec1cb5e22c4a24a6675; __jsl_clearance=1555571859.427|0|xIYeAeXRX%2BFS334V1evcRQStJdA%3D"
Host = "www.sac.net.cn"
r = requests.get(url, headers={'User-agent': ua, 'Cookie': Cookie, 'Host': Host})
# 设置头部信息
r.raise_for_status()
r.encoding = "utf-8"
#utf-8编码,具体根据实际情况而定
return r.text#返回文本,传入get_content
except:
print('无法链接服务器!