- Install the requests library
Install from the command line: pip install requests
import requests
r = requests.get("http://www.baidu.com")
# Inspecting the response
r.text      # the body decoded as text
r.content   # the raw bytes (typically used for binary data such as images)
# The encoding requests uses to decode r.text
>>> r.encoding
'utf-8'
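For example, r.encoding tells you how r.text was decoded, while r.content is what you save for binary files (the image URL below is a placeholder, not from the original notes):

import requests

r = requests.get("http://www.baidu.com")
print(r.encoding)                    # encoding used to decode r.text

# Binary data such as an image is saved from r.content, not r.text
img = requests.get("https://example.com/logo.png")   # placeholder URL
with open("logo.png", "wb") as f:
    f.write(img.content)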
HTTP defines many request methods; the two most common are (a minimal sketch of both follows the list):
- GET: request a representation of the specified resource
- POST: submit data to the specified resource
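A quick illustration of the difference (httpbin.org is a public echo service, used here only for demonstration):

import requests

# GET: the parameters travel in the URL's query string
r = requests.get("https://httpbin.org/get", params={"q": "python"})
print(r.url)            # https://httpbin.org/get?q=python

# POST: the data travels in the request body, not the URL
r = requests.post("https://httpbin.org/post", data={"q": "python"})
print(r.status_code)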
Searching on Douban:
https://www.douban.com/search?q=python (note the q=python at the end)
The search terms are carried in the URL itself, as query-string parameters.
import requests
r = requests.get("https://www.douban.com")
r.encoding = 'utf-8'

# Pass query-string parameters with params (payload avoids shadowing the built-in dict)
payload = {'q': 'java', 'cat': 1001}
r = requests.get('https://www.douban.com/search', params=payload)
print(r.url)   # https://www.douban.com/search?q=java&cat=1001
Sometimes you need to log in to a site, sending the username and password with a POST request. Define the request header first, since the call below uses it:
# A User-Agent header makes the request look like a normal browser and helps avoid simple anti-scraping checks
header = {'User-Agent':"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.105 Safari/537.36"}
# Pass the username and password in the request body
payload = {'user':'wuqinglas','password':'12345678'}
r = requests.post('http://www.douban.com', data=payload, headers=header)
r.status_code
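To stay logged in across several requests, a requests.Session keeps cookies between calls. A minimal sketch, assuming a hypothetical login endpoint (the URL and form fields below are placeholders, not Douban's real ones):

import requests

session = requests.Session()
session.headers.update({'User-Agent': 'Mozilla/5.0'})

# Hypothetical login endpoint and form fields, for illustration only
r = session.post('https://example.com/login',
                 data={'user': 'wuqinglas', 'password': '12345678'})

# Later requests on the same session automatically send the login cookie
r = session.get('https://example.com/profile')
print(r.status_code)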
Status code classes:
- 2xx: the request was received and processed successfully
- 3xx: redirection (e.g. 301 Moved Permanently; r.history lists the redirects that were followed)
- 4xx: client error
- 5xx: server error
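Checking these in code:

import requests

r = requests.get("http://www.baidu.com")
print(r.status_code)     # e.g. 200 for success
print(r.history)         # responses for any redirects that were followed
r.raise_for_status()     # raises requests.HTTPError on 4xx/5xx codes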
Example: scraping Q房网 (qfang.com)
import requests
from lxml import etree
import csv
import time

def writecsv(item):
    # Append one row; newline='' prevents blank lines on Windows
    with open('Q房.csv', 'a', newline='', encoding='utf-8') as f:
        writer = csv.writer(f)
        try:
            writer.writerow(item)
        except Exception:
            print('write error')

if __name__ == '__main__':
    headers = {
        "User-Agent": "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0)"
    }
    start_url = "https://shenzhen.qfang.com/garden/n"
    for page in range(1, 2):          # the page number is appended to the URL
        houseurl = start_url + str(page)
        r = requests.get(houseurl, headers=headers)
        time.sleep(1)                 # be polite: pause between requests
        selector = etree.HTML(r.text)
        xiaoqulist = selector.xpath('/html/body/div[4]/div/div/div[3]/ul/li')
        for xiaoqu in xiaoqulist:
            name = xiaoqu.xpath('div[2]/div[1]/a/text()')[0]          # estate name
            bankuai = xiaoqu.xpath('div[2]/div[4]/div/text()')[0]     # district
            junjia = xiaoqu.xpath('div[3]/div[1]/span[1]/text()')[0]  # average price
            item = [name, bankuai, junjia]
            writecsv(item)
            print('Scraping...', name)
#/html/body/div[4]/div/div/div[3]/ul/li[1]
#/html/body/div[4]/div/div/div[3]/ul/li[1]/div[2]/div[1]/a
#/html/body/div[4]/div/div/div[3]/ul/li[1]/div[2]/div[4]/div
#/html/body/div[4]/div/div/div[3]/ul/li[1]/div[3]/div[1]/span[1]
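One small refinement, assuming the three columns above: write a header row once before scraping so the CSV is self-describing (the column labels are my own, matching the order [name, bankuai, junjia]):

import csv

with open('Q房.csv', 'w', newline='', encoding='utf-8') as f:
    csv.writer(f).writerow(['name', 'district', 'avg_price'])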
Example: scraping WeChat articles (via Sogou's WeChat search)
from lxml import etree
import requests
import os

def spider(url):
    r = requests.get(url, headers=headers, proxies=proxies)
    return etree.HTML(r.text)

def get_all_url(yeshu, neirong):
    # Collect article links from each search-result page
    for sousuoye in range(1, int(yeshu) + 1):
        sousuo_url = ("https://weixin.sogou.com/weixin?query=" + neirong
                      + "&_sug_type_=&s_from=input&_sug_=n&type=2&page="
                      + str(sousuoye) + "&ie=utf8")   # current page number, not the total
        selector = spider(sousuo_url)
        meiye_url = selector.xpath('//div[@class="txt-box"]/h3/a/@href')
        all_url.extend(meiye_url)     # extend keeps all_url a flat list of links

def towrite(wenzi, title):
    try:
        # Strip '|' because it is not allowed in Windows filenames
        with open('./wenjian/' + title.replace('|', '') + '.txt', 'wt', encoding='utf-8') as f:
            f.write(wenzi)
        print("Downloading", title)
    except Exception:
        print("Download failed")

def spider_xiangqing_url(url):
    selector = spider(url)
    title = selector.xpath('//*[@id="activity-name"]/text()')[0].strip()
    # Join the text of every <p> element; //p/text() alone returns plain
    # strings, which cannot be queried further with xpath
    paragraphs = selector.xpath('//p')
    wenzi = ''.join(p.xpath('string(.)') for p in paragraphs)
    wenzi = wenzi.strip().replace('\r', '').replace('\n', '')
    towrite(wenzi, title)

if __name__ == '__main__':
    headers = {
        "User-Agent": "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0)"
    }
    # requests keeps at most one proxy per scheme, so duplicate "HTTP" keys
    # would silently overwrite each other; a usable proxy URL normally also
    # includes a port. Other addresses from these notes: 171.35.163.119,
    # 113.195.18.187 (see the rotation sketch below).
    proxies = {
        "http": "115.218.5.246"
    }
    sousuoneirong = input("Enter the search terms: ")
    sousuoyeshu = input("Enter the number of pages to search (a positive integer): ")
    os.makedirs('./wenjian', exist_ok=True)   # the output directory must exist
    all_url = []
    get_all_url(sousuoyeshu, sousuoneirong)
    for url in all_url:
        spider_xiangqing_url(url)
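If all three proxy addresses are meant to be used, one common pattern is to pick one at random per request, since requests accepts only one proxy per scheme. A sketch under that assumption (ports are omitted here as in the notes, but a working proxy URL normally includes one, e.g. "http://115.218.5.246:8080"):

import random
import requests

proxy_pool = ["115.218.5.246", "171.35.163.119", "113.195.18.187"]

def fetch(url):
    proxies = {"http": random.choice(proxy_pool)}   # one proxy per scheme
    return requests.get(url, proxies=proxies, timeout=10)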