A Concise Web Scraping Tutorial

  1. Install the requests library
    From the command line: pip install requests
import requests

r = requests.get("http://www.baidu.com")

# Inspect the response
r.text       # the body decoded as text
r.content    # the raw bytes (typically used when downloading images)
r.encoding   # the encoding used to decode r.text, e.g. 'utf-8'
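
Since r.content holds the raw bytes, it is what you write to disk when downloading a binary resource such as an image. A minimal sketch (the image URL and file name below are just placeholders):

import requests

# Hypothetical image URL, used only for illustration
img_url = "https://example.com/picture.png"
r = requests.get(img_url)

# r.content is raw bytes, so open the output file in binary mode
with open("picture.png", "wb") as f:
    f.write(r.content)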

HTTP supports many request methods; the two most common are:

  1. GET: request a specific resource
  2. POST: submit data to the specified resource as part of the request

Searching for something on Douban:
https://www.douban.com/search?q=python  (note the q=python parameter)
The search term is carried in the URL itself as a query string.

import requests

# Pass query-string parameters via params; requests appends them to the URL
payload = {'q': 'java', 'cat': 1001}
r = requests.get('http://www.douban.com', params=payload)
r.encoding = 'utf-8'
print(r.url)   # shows the final URL with ?q=java&cat=1001 appended

Sometimes you need to log in to a website:

# A User-Agent header makes the request look like a normal browser and helps avoid basic anti-scraping checks
header = {'User-Agent': "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.105 Safari/537.36"}
# Send the username and password as form data
login_data = {'user': 'wuqinglas', 'password': '12345678'}
r = requests.post('http://www.douban.com', data=login_data, headers=header)
r.status_code   # check whether the request succeeded
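
If the login needs to persist across later requests, requests also provides Session objects that keep cookies between calls. A minimal sketch, reusing the made-up credentials and URL from above:

import requests

session = requests.Session()
session.headers.update({'User-Agent': "Mozilla/5.0 (Windows NT 10.0; Win64; x64)"})

# The login URL and form field names are placeholders; a real site defines its own
login_data = {'user': 'wuqinglas', 'password': '12345678'}
session.post('http://www.douban.com', data=login_data)

# Later requests made through the same Session reuse the login cookies
r = session.get('http://www.douban.com')
print(r.status_code)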

Status code families:

2xx: the request was received and handled successfully
3xx: redirection (e.g. 301); the redirect history is kept in r.history
4xx: client-side request error
5xx: server error
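
To see these in practice, check r.status_code after a request and look at r.history when a redirect happened:

import requests

r = requests.get("http://www.baidu.com")
print(r.status_code)    # final status code, e.g. 200

# requests follows redirects automatically; any intermediate 3xx responses
# it passed through are collected in r.history
if r.history:
    print("redirected via:", [resp.status_code for resp in r.history])

if r.status_code >= 400:
    print("request failed with status", r.status_code)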

Example: scraping Qfang (Q房网)

import requests
from lxml import etree
import csv
import time

def writecsv(item):
    # Append one row to the CSV file; newline='' avoids blank lines on Windows
    with open('Q房.csv', 'a', encoding='utf-8', newline='') as f:
        writer = csv.writer(f)
        try:
            writer.writerow(item)
        except Exception:
            print('write error')

if __name__ == '__main__':
    headers = {
        "User-Agent": "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0)"
    }

    start_url = "https://shenzhen.qfang.com/garden/n"
    for page in range(1, 2):            # page numbers to crawl
        houseurl = start_url + str(page)
        r = requests.get(houseurl, headers=headers)
        time.sleep(1)                   # be polite: pause between requests
        selector = etree.HTML(r.text)
        # Each <li> in this list is one residential community (xiaoqu)
        xiaoqulist = selector.xpath('/html/body/div[4]/div/div/div[3]/ul/li')
        for xiaoqu in xiaoqulist:
            name = xiaoqu.xpath('div[2]/div[1]/a/text()')[0]          # community name
            bankuai = xiaoqu.xpath('div[2]/div[4]/div/text()')[0]     # district / area
            junjia = xiaoqu.xpath('div[3]/div[1]/span[1]/text()')[0]  # average price
            item = [name, bankuai, junjia]
            writecsv(item)
            print('scraping...', name)


#/html/body/div[4]/div/div/div[3]/ul/li[1]
#/html/body/div[4]/div/div/div[3]/ul/li[1]/div[2]/div[1]/a
#/html/body/div[4]/div/div/div[3]/ul/li[1]/div[2]/div[4]/div
#/html/body/div[4]/div/div/div[3]/ul/li[1]/div[3]/div[1]/span[1]
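
The commented XPaths above are the absolute paths copied from the browser for the first list item; the script shortens them to paths relative to each li element. A quick way to sanity-check an expression before running the whole script is to fetch one page and try it interactively (a rough sketch; the page structure may well have changed since this was written):

import requests
from lxml import etree

headers = {"User-Agent": "Mozilla/5.0"}
r = requests.get("https://shenzhen.qfang.com/garden/n1", headers=headers)
tree = etree.HTML(r.text)

# The list XPath should return one element per community on the page
items = tree.xpath('/html/body/div[4]/div/div/div[3]/ul/li')
print(len(items))

# Then test the relative sub-paths against the first item
if items:
    print(items[0].xpath('div[2]/div[1]/a/text()'))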

Scraping WeChat articles (via Sogou search)

from lxml import etree
import requests
import os

def spider(url):
    # Fetch a page and return it as an lxml HTML tree
    r = requests.get(url, headers=headers, proxies=proxies)
    return etree.HTML(r.text)

def get_all_url(yeshu, neirong):
    # Collect the article links from each Sogou search-result page
    for sousuoye in range(1, int(yeshu) + 1):
        sousuo_url = ("https://weixin.sogou.com/weixin?query=" + neirong +
                      "&_sug_type_=&s_from=input&_sug_=n&type=2&page=" +
                      str(sousuoye) + "&ie=utf8")
        selector = spider(sousuo_url)
        meiye_url = selector.xpath('//div[@class="txt-box"]/h3/a/@href')
        all_url.extend(meiye_url)      # meiye_url is already a list of links

def towrite(wenzi, title):
    os.makedirs('./wenjian', exist_ok=True)
    try:
        # '|' is not allowed in Windows file names, so strip it from the title
        with open('./wenjian/' + title.replace('|', '') + '.txt', 'wt', encoding='utf-8') as f:
            f.write(wenzi)
            print("downloading", title)
    except OSError:
        print("download failed", title)

def spider_xiangqing_url(url):
    # Download one article page and save its title and text
    selector = spider(url)
    title = selector.xpath('//*[@id="activity-name"]/text()')[0].strip()
    # Select the <p> elements (not their text nodes) so string(.) can
    # concatenate all the text inside each paragraph
    paragraphs = selector.xpath('//p')
    wenzi = ''.join(p.xpath('string(.)') for p in paragraphs)
    wenzi = wenzi.strip().replace('\r', '').replace('\n', '')
    towrite(wenzi, title)

if __name__ == '__main__':
    headers = {
        "User-Agent": "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0)"
    }
    # requests expects one entry per scheme, e.g. {"http": "http://ip:port"}.
    # A dict cannot hold duplicate keys, so only one proxy can be set at a time;
    # the free-proxy IP below is a placeholder and needs a port to actually work.
    proxies = {
        "http": "http://115.218.5.246",
        # other free proxies listed in the original post:
        # 171.35.163.119, 113.195.18.187
    }
    sousuoneirong = input("Search term: ")
    sousuoyeshu = input("Number of pages to search (a positive integer): ")
    all_url = []
    get_all_url(sousuoyeshu, sousuoneirong)
    for url in all_url:
        spider_xiangqing_url(url)