The Python used here is Python 3.x, which differs slightly from Python 2.x in how some libraries are called.
First we need to know a couple of libraries: urllib and urllib.request, which handle network requests.
1. The simplest crawler
Crawling Baidu
import urllib
import urllib.parse
import urllib.request
# Crawl Baidu search results for a given keyword
data = {}
data['word'] = '火影'
url_values = urllib.parse.urlencode(data)
url = "http://www.baidu.com/s?"
full_url = url+url_values
data = urllib.request.urlopen(full_url).read()
data = data.decode('UTF-8')
print(data)
In the same way, we can crawl JD.com and search for product information.
import urllib
import urllib.parse
import urllib.request
import re
# Crawl JD.com for a given product keyword
data = {}
data['keyword'] = '被子'
url_values = urllib.parse.urlencode(data)
url = "http://search.jd.com/Search?"
fullurl = url + url_values + '&enc=utf-8'
print(fullurl)
d = urllib.request.urlopen(fullurl).read()
d = d.decode('utf-8')
print(d)
f = open('e:/test.txt', 'w', encoding='gbk')
src = 'title="(.*?)"'
s = re.findall(src, d)
for m in s:
    print(m)
c = d.encode('gbk', 'ignore')  # encode to GBK, dropping characters GBK cannot represent
c = c.decode('gbk', 'ignore')  # decode back to a GBK-safe string
f.write(c)
f.close()
Because the output text file is written in GBK while the original page is UTF-8, we first encode the decoded page to GBK (dropping characters GBK cannot represent) and then decode it back before writing.
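The lines below are a minimal standalone sketch of that encode/decode round trip (the sample string is made up), just to show how errors='ignore' silently drops characters GBK cannot represent:
# Sketch: make a string GBK-safe before writing it to a GBK-encoded file
text = '被子 😀 quilt'                      # the emoji has no GBK representation
gbk_safe = text.encode('gbk', 'ignore')    # bytes; the emoji is silently dropped
text = gbk_safe.decode('gbk')              # back to str, now safe to write to a GBK file
print(text)                                # -> '被子  quilt'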
2. Downloading images from a web page
# Python 3.4 crawler tutorial
# Download the images on a page
import urllib.request
import socket
import re
import sys
import os
targetDir = "E:\\123"  # directory where the images will be saved
def destFile(path):
    if not os.path.isdir(targetDir):
        os.mkdir(targetDir)
    pos = path.rindex('/')
    t = os.path.join(targetDir, path[pos+1:])
    return t
weburl = 'http://www.douban.com/'
webheaders = {'User-Agent':'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:23.0) Gecko/20100101 Firefox/23.0'}
req = urllib.request.Request(url=weburl, headers=webheaders)  # build the request with headers
webpage = urllib.request.urlopen(req)  # send the request
contentBytes = webpage.read()
# print(contentBytes.decode('utf-8'))
content = contentBytes.decode('utf-8')
pattern = re.compile('<img.*?src="(.*?)"', re.S)  # regex that captures the image URLs
items = re.findall(pattern, content)
for item in items:
    print(item)
    urllib.request.urlretrieve(item, destFile(item))
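Note that src attributes are not always absolute http:// URLs; on many pages they are relative or protocol-relative (starting with //), and urlretrieve would fail on those. A small sketch, assuming you want to normalize such URLs first, is to join each match against the page URL:
# Sketch (assumption, not in the original): normalize relative or protocol-relative image URLs
from urllib.parse import urljoin
for item in items:
    absolute = urljoin(weburl, item)   # e.g. '//img.example.com/a.jpg' -> 'http://img.example.com/a.jpg'
    print(absolute)
    urllib.request.urlretrieve(absolute, destFile(absolute))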
3. Pages that require login, using our school's information portal as an example
Here we need to POST the login form data.
import urllib.request
import urllib.parse
import re
import http.cookiejar
class USTC:
    def __init__(self):
        self.loginurl = "http://portal.uestc.edu.cn/"
        self.header = {
            'Connection': 'Keep-Alive',
            'Accept-Language': 'en-US,en;q=0.8,zh-Hans-CN;q=0.5,zh-Hans;q=0.3',
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.3; WOW64; Trident/7.0; rv:11.0) like Gecko',
        }
        self.postDict = {'username': 'xxxxxxxx',
                         'password': 'xxxxxxxx',
                         }
    def getOpener(self, head):
        # Build an opener that keeps cookies across requests and carries the given headers
        cj = http.cookiejar.CookieJar()
        pro = urllib.request.HTTPCookieProcessor(cj)
        opener = urllib.request.build_opener(pro)
        header = []
        for key, value in head.items():
            elem = (key, value)
            header.append(elem)
        opener.addheaders = header
        return opener
    def getdata(self):
        opener = self.getOpener(header)  # uses the module-level header dict defined below
        postData = urllib.parse.urlencode(self.postDict).encode('utf-8')
        print(postData)
        op = opener.open(self.loginurl, postData)
        data = op.read()
        print(data.decode())
header = {'Content-Type': 'application/x-www-form-urlencoded',
          'Host': 'idas.uestc.edu.cn',
          'Origin': 'http://idas.uestc.edu.cn',
          'Referer': 'http://idas.uestc.edu.cn/authserver/login?service=http%3A%2F%2Fportal.uestc.edu.cn%2F',
          'User-Agent': 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/31.0.1650.63 Safari/537.36'
          }
sdu = USTC()
sdu.getdata()
After crawling a page and obtaining its HTML, you still need regular expressions to filter out the specific items you want.
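As a minimal sketch of that filtering step (the HTML fragment and pattern below are made-up examples, not taken from any of the sites above):
import re
# Made-up HTML fragment used only to illustrate filtering with re.findall
html = '<a href="/item/1" title="第一个商品">x</a><a href="/item/2" title="第二个商品">y</a>'
titles = re.findall('title="(.*?)"', html)   # non-greedy capture of each title attribute
print(titles)                                # -> ['第一个商品', '第二个商品']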