A small Python crawler

My goal is to crawl images. I have a list of names that are all uppercase, while the URL suffixes are all lowercase, so first I write a small Python script to convert all the text to lowercase.

with open('C:/Users/张浩然/Desktop/新建文本文档.txt', 'r') as f, \
     open('C:/Users/张浩然/Desktop/12.txt', 'a') as f1:
    for i in f:             # i must be a str, so I feed the file in line by line with a for loop
        i1 = i.lower()      # lowercase this line
        f1.write(i1)        # append it to the output file

Now all the file names have been converted to lowercase.
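One thing to note: the lines written to 12.txt still end with a newline, which is why the crawler below strips it with replace('\n', ''). If you would rather normalize the list up front, a minimal variant (same paths assumed; this is my sketch, not the original script) could strip whitespace and skip blank lines while lowercasing:

# Variant: lowercase and strip surrounding whitespace in one pass,
# skipping blank lines, so the name list is already clean.
with open('C:/Users/张浩然/Desktop/新建文本文档.txt', 'r') as src, \
     open('C:/Users/张浩然/Desktop/12.txt', 'w') as dst:
    for line in src:
        name = line.strip().lower()
        if name:                     # skip empty lines
            dst.write(name + '\n')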

import urllib              # import the urllib package
import urllib.request
cookie = "b541d8d2e159c0933faa2c75bba3e44c=07935a5639e5ba7325c8f3afbf72e269; 508037ad21707ca9a9a449278f5918bf=5b1f9c70c28c008535aa81223d98914c; zenAdminID=c39irjm3rjsqd2lhcvppks10r6; _ga=GA1.2.662446443.1545625636; _fs_fid=6d36bcc61fde399e256fb4122911e5366e32c9e9; __lc.visitor_id.g9563165_0=S1545625636.684d3b5601; _ym_uid=1545625637357807893; _ym_d=1545625637; __lc.visitor_id.9563165=S1545625636.917159cb0d; _gcl_au=1.1.80444943.1553569604; _gid=GA1.2.964799538.1555319145; _fs_ses.en=25f8eb5a179681bb3740abf611d1654bc06accea; lc_sso9563165=1555473253484; _ym_isad=2; _fs_vid.en=6d36bcc61fde399e256fb4122911e5366e32c9e9.1545625636.48.1555473638.1555475438..1"  # put my cookie into the cookie variable (so the requests count as logged in)
header = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.110 Safari/537.36',  # custom headers
    'Connection': 'keep-alive',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
    'Cookie': cookie
}
url = "http://cn.fs.com:8006/YX_aRkDuKjEvLDp/images/admin/"  # base url; the trailing <name>.jpg part is filled in inside the for loop below
with open('C:/Users/张浩然/Desktop/12.txt', 'r') as f:  # open the name list for reading, referring to it as f
    for i in f:                     # iterate over the file line by line
        i = i.replace('\n', '')     # strip the trailing newline from this line
        url = 'http://cn.fs.com:8006/YX_aRkDuKjEvLDp/images/admin/{}.jpg'.format(i)  # build the full image url
        print(url)                  # print the url
        request = urllib.request.Request(url=url, headers=header, method='GET')  # build the http request and assign it to request
        try:                        # exception handling: if the request fails, print "fuck no" and move on to the next iteration
            response = urllib.request.urlopen(request)  # send the request; the server's response is assigned to response
        except:
            print("fuck no")
            continue
        get_img = response.read()   # read the response body into get_img
        
        with open("C:/Users/张浩然/Desktop/fs/{name_}.jpg".format(name_=i),'wb') as fp:
            
            fp.write(get_img)
            print('download complete')
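One robustness tweak worth mentioning (a sketch of my own, not part of the run above): urlopen accepts a timeout argument, so a single slow request cannot hang the whole loop. Wrapped in a hypothetical fetch() helper it might look like this:

import urllib.request

def fetch(url, headers, timeout=10):
    # Send a GET request with a timeout (in seconds) and return the
    # response body as bytes, or None if anything goes wrong.
    request = urllib.request.Request(url=url, headers=headers, method='GET')
    try:
        return urllib.request.urlopen(request, timeout=timeout).read()
    except Exception as e:
        print("fetch failed:", url, e)
        return None

The download loop above would then simply skip any name for which fetch() returns None.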

The code above ran successfully, but it fetched fewer than 100 photos, so today I am rewriting the approach and stepping up the effort.
Analyzing the URLs again, we find that the order IDs are sequential and that each order page contains a link to its photo. So we can fetch each order page, use the re module to regex-match the line that contains the photo link, and then download it with urllib. The code is below:

import urllib
import urllib.request
import re

cookie='b541d8d2e159c0933faa2c75bba3e44c=07935a5639e5ba7325c8f3afbf72e269; 508037ad21707ca9a9a449278f5918bf=5b1f9c70c28c008535aa81223d98914c; zenAdminID=opmn0dcn7j3eshbm77jgq93ia4; _ga=GA1.2.662446443.1545625636; _fs_fid=6d36bcc61fde399e256fb4122911e5366e32c9e9; __lc.visitor_id.g9563165_0=S1545625636.684d3b5601; _ym_uid=1545625637357807893; _ym_d=1545625637; __lc.visitor_id.9563165=S1545625636.917159cb0d; _gcl_au=1.1.80444943.1553569604; _gid=GA1.2.964799538.1555319145; lc_sso9563165=1555473253484; _fs_vid.en=6d36bcc61fde399e256fb4122911e5366e32c9e9.1545625636.48.1555473638.1555475438..1'
header = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.110 Safari/537.36',
    'Connection': 'keep-alive',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
    'Cookie': cookie
}
url=[]
url_new=[]
url_new1=[]
for i in range(4, 800):    # go through order ids 4 to 800; there are 2000-odd ids in total, so keep a low profile and only grab 700-something for now
    url = 'http://cn.fs.com:8006/YX_aRkDuKjEvLDp/consult_customer_info.php?id={}'.format(i)  # build the order-page url
    request = urllib.request.Request(url=url, headers=header, method='GET')  # build the request for the order page
    try:
        response = urllib.request.urlopen(request)  # send the request to the order page; urlopen returns bytes
    except:
        print("url error")
    else:
        html = response.read()       # read the response
        html = html.decode('utf-8')  # the response is bytes, so decode it into utf-8 text
        img_url = re.search(r'images.*\.jpg', html)  # regex-match the image link
        try:
            img_url = img_url.group(0)  # pull the matched string itself out of the match object
        except:
            print("no match")
        else:
            url_new.append('http://cn.fs.com:8006/YX_aRkDuKjEvLDp/{}'.format(img_url))  # assemble the full link and append it to a list
            print("match success")

print("match complete, now delete duplicates")        

for n in url_new:    # remove duplicate links
    if n not in url_new1:
        url_new1.append(n)
print(url_new1)
for j in url_new1:
    request=urllib.request.Request(url=j,headers=header,method='GET')
    try:
        response=urllib.request.urlopen(request)
    except:
        print("fuck no!!!")
    else:
        img=response.read()
        j = j[51:]   # keep everything from character index 51 onward, i.e. the image filename after the fixed url prefix
        with open("C:/Users/张浩然/Desktop/fs1/{}".format(j), 'wb') as fp:  # 'wb' opens the file for binary writing; an existing file is overwritten
            fp.write(img)
            print('download complete')
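The j[51:] slice works only because the fixed part of the image url (up through images/admin/, assuming the matched paths look like images/admin/<name>.jpg) happens to be 51 characters long; if the path ever changes, the slice breaks. A length-independent way to get the filename (my variant, not in the original script; abc123.jpg below is just a made-up example) is to take the last path component:

import posixpath
from urllib.parse import urlsplit

def filename_from_url(url):
    # Return the last path component of a url, e.g. 'abc123.jpg'.
    return posixpath.basename(urlsplit(url).path)

print(filename_from_url(
    'http://cn.fs.com:8006/YX_aRkDuKjEvLDp/images/admin/abc123.jpg'))
# prints: abc123.jpg

In the same spirit, the manual de-duplication loop could be replaced with list(dict.fromkeys(url_new)), which removes duplicates while preserving order on Python 3.7+.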