Python与爬虫自学

最新推荐文章于 2024-04-27 16:14:52 发布

ArashiMoon

最新推荐文章于 2024-04-27 16:14:52 发布

阅读量218

点赞数

分类专栏：爬虫 | 文章标签：python

本文链接：https://blog.csdn.net/ArashiMoon/article/details/107624127

版权

爬虫专栏收录该内容

1 篇文章 0 订阅

订阅专栏

Python与爬虫自学

由于疫情期间没有返校，本人认为关键的大二与大三衔接的暑假不能虚度，因此开坑学习（没能选上课的）Python和爬虫，在此做相应记录。如果最终时间有冲突，可能会以大三各事项优先而搁置该部分内容。
参考视频：
Python爬虫技术5天速成（2020全新合集）

Urllib库

# -*- coding: utf-8 -*-
# @Time     : 2020/7/25 13:58
# @Author   : ArashiMoon
# @File     : testUrllib.py
# @Software : PyCharm

import urllib.request

#获取一个get请求
# response=urllib.request.urlopen("http://www.baidu.com")
# print(response.read().decode('utf-8'))#对获取到的网页源码进行utf-8解码

#获取一个post请求
# import urllib.parse
# data=bytes(urllib.parse.urlencode({"hello":"world"}),encoding="utf-8")
# response=urllib.request.urlopen("http://httpbin.org/post",data=data)
# print(response.read().decode("utf-8"))

#超时处理
# try:
#     response=urllib.request.urlopen("http://httpbin.org/get",timeout=0.01)
#     print(response.read().decode("utf-8"))
# except urllib.error.URLError as e:
#     print("time out!")

# response=urllib.request.urlopen("http://www.baidu.com")
# print(response.status)
# print(response.getheaders())
# print(response.getheader("server"))

# url="http://httpbin.org/post"
# headers={
#     "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:78.0) Gecko/20100101 Firefox/78.0"
# }
# data=bytes(urllib.parse.urlencode({"name":"eric"}),encoding="utf-8")
# req=urllib.request.Request(url=url,data=data,headers=headers,method="POST")
# response=urllib.request.urlopen(req)
# print(response.read().decode("utf-8"))

#伪装浏览器
#url="https://www.douban.com"
#headers={
#    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:78.0) Gecko/20100101 Firefox/78.0"
#}
#req=urllib.request.Request(url=url,headers=headers)
#response=urllib.request.urlopen(req)
#print(response.read().decode("utf-8"))

Bs4库

# -*- coding: utf-8 -*-
# @Time     : 2020/7/26 12:26
# @Author   : ArashiMoon
# @File     : testBs4.py
# @Software : PyCharm

'''
BeautifulSoup4将复杂HTML文档转换成一个复杂的树形结构，每个节点都是Python对象，所有对象可以归纳为4种：

-Tag
-NavigableString
-BeautifulSoup
-Comment

'''

from bs4 import BeautifulSoup

# Read the saved Baidu homepage as bytes and parse it with the built-in
# "html.parser" backend. The `with` block closes the file handle as soon as
# the content has been read (the original left it open for the whole script).
with open("./baidu.html","rb") as file:
    html=file.read()
bs=BeautifulSoup(html,"html.parser")

# print(bs.title)
# print(bs.a)
# print(type(bs.head))

#1.Tag 标签及其内容：拿到它所找到的第一个内容

# print(bs.title.string)
# print(type(bs.title.string))

#2.NavigableString 标签里的内容（字符串）

# print(bs.a.attrs)#属性

# print(type(bs))

#3.BeautifulSoup 表示整个文档

# print(bs.name)
# print(bs.attrs)
# print(bs)

# print(bs.a.string)
# print(type(bs.a.string))

#4.Comment 是一个特殊的NavigableString，输出的内容不包含注释符号

'''---------------------------------------------------'''

#文档的遍历

# print(bs.head.contents) #列表
# print(bs.head.contents[1])
# print(type(bs.head.children))

#更多内容,搜索BeautifulSoup文档



#文档的搜索

#(1) find_all()
#字符串过滤：会查找与字符串完全匹配的内容
# t_list=bs.find_all("a")
# print(t_list)

import re
#正则表达式搜索：使用search()来匹配内容
# t_list=bs.find_all(re.compile("a"))
# print(t_list)

#方法：传入一个函数（方法），根据函数的要求来搜索 (了解)
# def name_is_exists(tag):
#     return tag.has_attr("name")
#
# t_list=bs.find_all(name_is_exists)
#
# for item in t_list:
#     print(item)
#
# print(t_list)

#(2).kwargs  参数 keyword args
# t_list=bs.find_all(id="head")
# t_list=bs.find_all(href="http://news.baidu.com")
# t_list=bs.find_all(class_=True)

# for item in t_list:
#     print(item)

#(3).text参数

# t_list=bs.find_all(text="hao123")
# t_list=bs.find_all(text=["hao123","地图","贴吧"])

# t_list=bs.find_all(text=re.compile(r"\d"))#应用正则表达式来查找包含特定文本的内容（标签里的字符串）

#(4).limit参数

# t_list=bs.find_all("a",limit=3)
# for item in t_list:
#     print(item)

#css选择器
# t_list=bs.select("title") #通过标签来查找

# t_list=bs.select(".mnav")
# t_list=bs.select("#u1")
# t_list=bs.select("a[class='bri']")#注意单引号和双引号 通过属性来查找
# t_list=bs.select("head>title")#通过子标签来查找
# t_list=bs.select(".mnav~.bri")

# print(t_list[0].get_text())

# for item in t_list:
#     print(item)

re库

# -*- coding: utf-8 -*-
# @Time     : 2020/7/28 20:59
# @Author   : ArashiMoon
# @File     : testRe.py
# @Software : PyCharm

#正则表达式：字符串模式（判断字符串是否符合一定的标准）

import re
#创建模式对象
# pat=re.compile("AA")#此处的AA是正则表达式，用来验证其他的字符串
# m=pat.search("CBA")    #search的字符串：被校验的内容 #输出None（不匹配）

# m=pat.search("ABCAA")#输出<re.Match object; span=(3, 5), match='AA'>
# m=pat.search("AABCAADDCCAAA")#输出<re.Match object; span=(0, 2), match='AA'>
#search方法进行比对查找

#没有模式对象
# m=re.search("asd","Aasd")#<re.Match object; span=(1, 4), match='asd'>
#前面的字符串是规则（模板），后面的字符串是被校验的对象
# print(m)

# print(re.findall("a","ASDaDFGAa"))#前面字符串是规则（正则表达式），后面字符串是被校验的字符串

# print(re.findall("[A-Z]","ASDaDFGAa"))#['A', 'S', 'D', 'D', 'F', 'G', 'A']

# print(re.findall("[A-Z]+","ASDaDFGAa"))#['ASD', 'DFGA']

#sub

# print(re.sub("a","A","abcdcasd"))#找到a用A替换 AbcdcAsd

#建议在书写正则表达式时，在模式字符串（即规则）前加上r（原始字符串），就不用担心转义字符的问题

Tips:
（1）“惰性匹配”：注意 (.*) 和 (.*?) 的差别——(.*) 是贪婪匹配，会尽可能多地匹配字符；(.*?) 是惰性匹配，会尽可能少地匹配字符。

xlwt库

# -*- coding: utf-8 -*-
# @Time     : 2020/7/29 23:20
# @Author   : ArashiMoon
# @File     : testxlwt.py
# @Software : PyCharm

import xlwt

# workbook=xlwt.Workbook(encoding="utf-8")        #创建workbook对象
# worksheet=workbook.add_sheet('sheet1')          #创建工作表
# worksheet.write(0,0,'hello')                    #写入数据，第一参数-行，第二参数-列，第三参数-内容
# workbook.save('student.xls')                    #保存数据表

#九九乘法表练习
# workbook=xlwt.Workbook(encoding="utf-8")
# worksheet=workbook.add_sheet('sheet1')
# for i in range(1,10):
#     for j in range(i,10):
#         worksheet.write(i-1,j-1,str(i)+"*"+str(j)+"="+str(i*j))
#         #worksheet.write(i-1,j-1,"%d * %d = %d"%(i,j,i*j))
# workbook.save('multi.xls')

Sqlite

# -*- coding: utf-8 -*-
# @Time     : 2020/7/31 13:36
# @Author   : ArashiMoon
# @File     : testSqlite.py
# @Software : PyCharm

import sqlite3
# 1. Connect to the database.
conn=sqlite3.connect("test.db")         # open test.db, creating the file if it does not exist yet

# Progress message; the commented-out steps below reuse `conn` and close it themselves.
print("成功打开数据库")

#2.创建数据表
'''
c=conn.cursor()                     #获取游标

sql="""
    create table company
        (id int primary key not null,
        name text not null,
        age int not null,
        address char(50),
        salary real);
"""

c.execute(sql)                      #执行sql语句
conn.commit()                       #提交数据库操作
conn.close()                        #关闭数据库连接


print("成功建表")
'''
#3.插入数据

# c=conn.cursor()                     #获取游标
#
# sql1="""
#     insert into company (id,name,age,address,salary)
#     values (1,'张三',32,'成都',8000);
# """
#
# sql2="""
#     insert into company (id,name,age,address,salary)
#     values (2,'李四',30,'重庆',15000);
# """
#
# c.execute(sql1)                      #执行sql语句
# c.execute(sql2)                      #执行sql语句
# conn.commit()                       #提交数据库操作
# conn.close()                        #关闭数据库连接


# print("插入数据完毕")

#4.查询数据
# c=conn.cursor()                     #获取游标
# 
# sql="select id,name,age,address,salary from company"
# 
# cursor=c.execute(sql)                      #执行sql语句
# 
# for row in cursor:
#     print("id = ",row[0])
#     print("name = ",row[1])
#     print("age = ",row[2])
#     print("address = ",row[3])
#     print("salary = ",row[4],"\n")
# 
# conn.close()                        #关闭数据库连接
# 
# print("查询完毕")
#

最终的爬虫程序

# -*- coding: utf-8 -*-
# @Time     : 2020/7/15 13:04
# @Author   : ArashiMoon
# @File     : spider.py
# @Software : PyCharm

import re
# import bs4
import urllib.request,urllib.error
import xlwt
import sqlite3
from bs4 import BeautifulSoup
def main():
    """Crawl the Douban Top 250 listing and store the parsed rows in SQLite.

    The Excel export (saveData) is the alternative sink; only the SQLite
    path is active here.
    """
    # URL prefix of the listing; getData appends the start offset of each page.
    listing_url = "https://movie.douban.com/top250?start="
    # SQLite file that receives the results.
    db_file = "movie.db"

    # Fetch and parse every listing page into one row per movie.
    movies = getData(listing_url)

    # Persist the rows.
    saveData2DB(movies, db_file)

# Pre-compiled patterns that getData applies to the HTML of a single movie item.

# Detail-page link: captures the href value of an <a href="..."> tag.
findLink = re.compile(r'<a href="(.*?)">')
# Poster image: captures the src value of an <img> tag. DOTALL lets "." match
# newlines, so attributes spread over several lines are still matched.
findImgSrc = re.compile(r'<img.*src="(.*?)"', flags=re.DOTALL)
# Movie title(s): the text of each <span class="title">.
findTitle = re.compile(r'<span class="title">(.*?)</span>')
# Average rating.
findRating = re.compile(r'<span class="rating_num" property="v:average">(.*?)</span>')
# Number of ratings: the digits in front of the "人评价" suffix.
findJudge = re.compile(r'<span>(\d*?)人评价</span>')
# One-line summary quote.
findInq = re.compile(r'<span class="inq">(.*?)</span>')
# Descriptive paragraph of the item; DOTALL because the paragraph spans
# multiple lines.
findBd = re.compile(r'<p class="">(.*?)</p>', flags=re.DOTALL)

#爬取网页
def getData(baseurl):
    """Download the ten listing pages and parse every movie entry.

    Args:
        baseurl: URL prefix to which the start offset (0, 25, ..., 225) is
            appended to address each page.

    Returns:
        A list with one 8-element list of strings per movie, in this order:
        detail link, image link, first title, second title (or " "),
        rating, number of ratings, summary quote (or " "), description.

    Raises:
        IndexError: if an item lacks one of the mandatory fields (link,
            image, title, rating, rating count, description).
    """
    movies = []
    for page in range(10):
        # Each page lists 25 movies, so the offset advances in steps of 25.
        page_html = askURL(baseurl + str(page * 25))
        soup = BeautifulSoup(page_html, "html.parser")

        # `class_` (trailing underscore) is the bs4 keyword for the HTML
        # class attribute, because `class` is reserved in Python.
        for node in soup.find_all("div", class_="item"):
            fragment = str(node)    # the regexes work on the raw HTML text

            record = [
                findLink.findall(fragment)[0],
                findImgSrc.findall(fragment)[0],
            ]

            # An entry has either two title spans or just one.
            names = findTitle.findall(fragment)
            if len(names) == 2:
                record += [names[0], names[1].replace("/", "")]
            else:
                record += [names[0], " "]   # placeholder for the missing title

            record.append(findRating.findall(fragment)[0])
            record.append(findJudge.findall(fragment)[0])

            # The summary quote is optional; drop the Chinese full stop.
            quotes = findInq.findall(fragment)
            record.append(quotes[0].replace("。", "") if quotes else " ")

            # Flatten the description: <br/> tags and slashes become spaces.
            detail = findBd.findall(fragment)[0]
            detail = re.sub(r'<br(\s+)?/>(\s+)?', " ", detail)
            detail = re.sub(r'/', " ", detail)
            record.append(detail.strip())

            movies.append(record)

    return movies

#得到指定一个URL的网页内容
def askURL(url):
    """Fetch *url* and return the response body decoded as UTF-8.

    The request carries a browser-like User-Agent header so that the server
    does not see urllib's default agent string.

    Args:
        url: Address to request.

    Returns:
        The decoded body, or "" when the request fails with a URLError
        (HTTPError included). In that case the error's code and/or reason
        are printed.
    """
    head={
        # Well-formed Firefox User-Agent; the previous literal had stray
        # spaces around "/" and ":" that no real browser sends.
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:78.0) Gecko/20100101 Firefox/78.0"
    }
    request=urllib.request.Request(url,headers=head)
    html=""
    try:
        # The context manager closes the response (and its socket) even if
        # reading or decoding fails.
        with urllib.request.urlopen(request) as response:
            html=response.read().decode("utf-8")
    except urllib.error.URLError as e:
        # HTTPError carries a status code; a plain URLError only has a reason.
        if hasattr(e,"code"):
            print(e.code)
        if hasattr(e,"reason"):
            print(e.reason)

    return html



#保存数据
def saveData(datalist,savepath):
    """Write the scraped rows to an .xls workbook.

    Row 0 holds the column titles; every following row holds one movie.
    The number of rows written follows ``len(datalist)``, so a partial
    crawl (fewer than 250 movies) is saved instead of raising IndexError.

    Args:
        datalist: Sequence of per-movie sequences, in the column order of
            ``col`` below. Values beyond the last column are ignored.
        savepath: Destination path of the workbook.
    """
    print("save")
    book=xlwt.Workbook(encoding="utf-8",style_compression=0)
    sheet=book.add_sheet('豆瓣电影top250',cell_overwrite_ok=True)
    col=("电影详情连接","图片链接","影片中文名","影片外国名","评分","评价数","概况","相关信息")

    # Header row.
    for j,title in enumerate(col):
        sheet.write(0,j,title)

    # Data rows start at sheet row 1, right below the header.
    for i,data in enumerate(datalist,start=1):
        for j,value in enumerate(data[:len(col)]):
            sheet.write(i,j,value)

    book.save(savepath)

def saveData2DB(datalist,dbpath):
    """Insert the scraped rows into the ``movie250`` table of a SQLite file.

    Values are bound through ``?`` placeholders instead of being pasted into
    the SQL text. Titles or descriptions containing quote characters, or an
    empty rating count, therefore no longer break the statement, and scraped
    text cannot inject SQL. The rows in *datalist* are left unmodified.

    Args:
        datalist: Iterable of 8-item sequences in the order info_link,
            pic_link, cname, ename, score, rated, instroduction, info.
        dbpath: Path of the SQLite database file.

    Raises:
        sqlite3.Error: if the insert fails. Nothing is committed in that
            case, and the connection is still closed.
    """
    init_db(dbpath)

    # Column name "instroduction" is kept as-is to match the table schema.
    sql='''
        insert into movie250 (
            info_link,pic_link,cname,ename,score,rated,instroduction,info)
        values (?,?,?,?,?,?,?,?)
    '''
    conn=sqlite3.connect(dbpath)
    try:
        cur=conn.cursor()
        # score/rated arrive as strings; the numeric column affinity stores
        # them as numbers, as the unquoted literals did before.
        cur.executemany(sql,datalist)
        conn.commit()       # one commit for the whole batch
        cur.close()
    finally:
        conn.close()




def init_db(dbpath):
    """Make sure the ``movie250`` table exists in the SQLite file at *dbpath*.

    The table is created only when it is missing, so running the crawler a
    second time against the same file no longer fails with
    "table movie250 already exists". Existing rows are left untouched.

    Args:
        dbpath: Path of the SQLite database file (created if absent).
    """
    sql='''
        create table if not exists movie250 (
            id integer primary key autoincrement,
            info_link text,
            pic_link text,
            cname varchar,
            ename varchar,
            score numeric,
            rated numeric,
            instroduction text,
            info text
        )
    '''
    conn=sqlite3.connect(dbpath)
    try:
        cursor=conn.cursor()
        cursor.execute(sql)
        conn.commit()
    finally:
        # Release the file handle even when the statement fails.
        conn.close()


# Run the crawler only when this file is executed as a script, not when it is
# imported; the final message is printed after main() returns.
if __name__=="__main__":
    main()
    print("爬取完毕")