Web Crawler Basics Notes

Basic operations of the requests library

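Every example below follows the same fetch pattern: send the request, raise on HTTP errors, guess the text encoding from the page content, and fall back on failure. A minimal sketch of that pattern (the helper name getHTMLText matches the later examples; the timeout value is illustrative):

import requests

def getHTMLText(url):
    try:
        r = requests.get(url, timeout=30)    # fail fast if the server hangs
        r.raise_for_status()                 # raise HTTPError on 4xx/5xx responses
        r.encoding = r.apparent_encoding     # guess the encoding from the content
        return r.text
    except:
        return ""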

#_author:  86138
#date:  2020/3/29
# import requests   # crawl an Amazon product page
# url = "https://www.amazon.cn/dp/B07CRHCK77?smid=A3CQWPW49OI3BQ&ref_=Oct_CBBBCard_dsk_asin2&pf_rd_r=N8WT13Q2P3085ZPV093X&pf_rd_p=5a0738df-7719-4914-81ee-278221dce082&pf_rd_m=A1AJ19PSB66TGU&pf_rd_s=desktop-3"
# try:
#     kv = {'user-agent':'Mozilla/5.0'}   # spoof a browser User-Agent so the site accepts the request
#     r = requests.get(url, headers = kv)
#     r.raise_for_status()
#     r.encoding = r.apparent_encoding
#     print(r.text[1000:2000])
# except:
#     print("爬取失败")   # "crawl failed"


# import requests   # Baidu keyword search
# keyword = "python"
# try:
#     kv = {'wd':keyword}  # for 360 Search, change 'wd' to 'q'
#     r = requests.get("http://www.baidu.com/s", params = kv)
#     print(r.request.url)
#     r.raise_for_status()
#     print(len(r.text))
# except:
#     print("爬取失败")


# >>> import requests   # download an image
# >>> import os
# >>> url = "https://tvax1.sinaimg.cn/crop.0.0.1002.1002.180/b59af28fly8fodpz8zxejj20ru0rudh6.jpg?KID=imgbed,tva&Expires=1586155537&ssig=wNM13DZqCW"
# >>> root = "D://pics//"
# >>> path = root + "abc.jpg"  # directory + file name; url.split('/')[-1] would keep the original name
# >>> try:
# ...     if not os.path.exists(root):  # create the directory if it does not exist
# ...         os.mkdir(root)
# ...     if not os.path.exists(path):  # only download if the file is not already there
# ...         r = requests.get(url)
# ...         with open(path, 'wb') as f:  # write the returned binary content to the file
# ...             f.write(r.content)
# ...             print("文件保存成功")   # "file saved"
# ...     else:
# ...         print("文件已存在")   # "file already exists"
# ... except:
# ...     print("爬取失败")

# # download a song / video
# import requests
# import os
# url = "https://sharefs.yun.kugou.com/202004061144/6f43c939cfe9a79cd90dfb58cb6f00ad/G200/M03/04/18/CA4DAF5FCFeAR0zvADDFozYoPDI733.mp3"
# root = "D://pics//"
# path = root + "abc.mp3"  # directory + file name; url.split('/')[-1] would keep the original name
# try:
#     if not os.path.exists(root):  # create the directory if it does not exist
#         os.mkdir(root)
#     if not os.path.exists(path):  # only download if the file is not already there
#         r = requests.get(url)
#         with open(path, 'wb') as f:  # write the returned binary content to the file
#             f.write(r.content)
#             print("文件保存成功")
#     else:
#         print("文件已存在")
# except:
#     print("爬取失败")
#
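For large media files, r.content buffers the whole response in memory before writing. A streamed variant writes chunk by chunk instead; a minimal sketch, with url and path as placeholders:

import requests

url = "https://example.com/abc.mp3"   # placeholder
path = "D:/pics/abc.mp3"              # placeholder

r = requests.get(url, stream=True)    # stream=True defers downloading the body
r.raise_for_status()
with open(path, 'wb') as f:
    for chunk in r.iter_content(chunk_size=8192):   # write 8 KB at a time
        f.write(chunk)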

# import requests  # look up an IP address
# url = "http://m.ip138.com/ip.asp?ip="
# try:
#     r = requests.get(url + "ip地址")   # replace "ip地址" with the IP address to query
#     r.raise_for_status()
#     r.encoding = r.apparent_encoding
#     print(r.text[-500:])
# except:
#     print("爬取失败")

Basic operations of the BeautifulSoup library


Only tags that share the same parent can be traversed laterally as siblings. Downward traversal walks a tag's children, upward traversal its parents, and the implementation of each direction is similar.
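A minimal sketch of the three traversal directions, using the demo page fetched in the next snippet (tag names match that page):

import requests
from bs4 import BeautifulSoup

demo = requests.get("http://python123.io/ws/demo.html").text
soup = BeautifulSoup(demo, "html.parser")

tag = soup.find('a')               # the first <a> tag
print(tag.parent.name)             # upward: 'p'
print(list(soup.body.children))    # downward: direct children of <body>
print(repr(tag.next_sibling))      # lateral: ' and ', the text between the two links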
Hands-on practice

# >>> import requests
# >>> r = requests.get("http://python123.io/ws/demo.html")
# >>> r.text
# '<html><head><title>This is a python demo page</title></head>\r\n<body>\r\n<p class="title"><b>The demo python introduces several python courses.</b></p>\r\n<p class="course">Python is a wonderful general-purpose programming language. You can learn Python from novice to professional by tracking the following courses:\r\n<a href="http://www.icourse163.org/course/BIT-268001" class="py1" id="link1">Basic Python</a> and <a href="http://www.icourse163.org/course/BIT-1001870001" class="py2" id="link2">Advanced Python</a>.</p>\r\n</body></html>'
# >>> demo = r.text
# >>> from bs4 import BeautifulSoup   # using the bs4 library
# >>> soup = BeautifulSoup(demo, "html.parser")
# >>> print(soup.prettify())
# <html>
#  <head>
#   <title>
#    This is a python demo page
#   </title>
#  </head>
#  <body>
#   <p class="title">
#    <b>
#     The demo python introduces several python courses.
#    </b>
#   </p>
#   <p class="course">
#    Python is a wonderful general-purpose programming language. You can learn Python from novice to professional by tracking the following courses:
#    <a class="py1" href="http://www.icourse163.org/course/BIT-268001" id="link1">
#     Basic Python
#    </a>
#    and
#    <a class="py2" href="http://www.icourse163.org/course/BIT-1001870001" id="link2">
#     Advanced Python
#    </a>
#    .
#   </p>
#  </body>
# </html>

# find all the links (URLs) in an HTML page
# import requests
# r = requests.get("http://python123.io/ws/demo.html")
# demo = r.text
# from bs4 import BeautifulSoup
# soup = BeautifulSoup(demo, "html.parser")
# for link in soup.find_all('a'):   # soup.find_all('a') == soup('a'), and the same shorthand works for any <tag>   # signature: .find_all(name, attrs, recursive(True/False), string, **kwargs)
#     print(link.get('href'))
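find_all accepts more than a tag name; a few variations against the same soup, following the signature noted above:

soup.find_all(['a', 'b'])               # several tag names at once
soup.find_all('a', class_='py1')        # filter by attribute ('class_' avoids the Python keyword)
soup.find_all(id='link2')               # any tag with id="link2"
soup.find_all(string='Basic Python')    # match string content instead of tags
soup(True)                              # shorthand for find_all(True): every tag in the document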


### Crawl university rankings
# import requests   
# from bs4 import BeautifulSoup
# import bs4
#
# def getHTMLText(url):
#     try:
#         r = requests.get(url, timeout = 30)
#         r.raise_for_status()
#         r.encoding = r.apparent_encoding
#         return r.text
#     except:
#         return ""
#
# def fillUnivList(ulist, html):
#     soup = BeautifulSoup(html, "html.parser")
#     for tr in soup.find('tbody').children:   # iterate over the table rows
#         if isinstance(tr, bs4.element.Tag):
#             tds = tr('td')
#             ulist.append([tds[0].string, tds[1].string, tds[2].string, tds[3].string])
#
# def printUnivList(ulist, num):
#     print("{:^10}\t{:^6}\t{:^10}".format("排名","学校名称","总分"))
#     for i in range(num):
#         u = ulist[i]
#         print("{:^10}\t{:^6}\t{:^10}".format(u[0],u[1],u[2]))


## Optimized version
#def printUnivList(ulist, num):
    # tplt = "{0:^10}\t{1:{3}^10}\t{2:^10}"   ##中英文混合格式化输出优化
    # print(tplt.format("排名","学校名称","总分",chr(12288)))
    # for i in range(num):
    #     u=ulist[i]
    #     print(tplt.format(u[0],u[1],u[2],chr(12288)))
    #
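chr(12288) is the full-width CJK space. The default ASCII-space padding is half the width of a Chinese character, so columns holding Chinese text drift out of alignment; padding with the full-width space keeps them straight. A quick illustration:

tplt = "{0:^10}\t{1:{3}^10}\t{2:^10}"
print(tplt.format("排名", "学校名称", "总分", chr(12288)))   # header row
print(tplt.format(1, "某某大学", 95.0, chr(12288)))          # illustrative data row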


![](https://img-blog.csdnimg.cn/20200412111524841.png?x-oss-process=image/watermark,type_ZmFuZ3poZW5naGVpdGk,shadow_10,text_aHR0cHM6Ly9ibG9nLmNzZG4ubmV0L0ZlaGxpbmdMZWU=,size_16,color_FFFFFF,t_70)
# def main():
#     unifo = []
#     url = 'https://www.zuihaodaxue.cn/zuihaodaxuepaiming2016.html'
#     html = getHTMLText(url)
#     fillUnivList(unifo, html)
#     printUnivList(unifo, 20)  # print the top 20 universities
# main()

Basic operations of the re library

Hands-on practice

# regular expressions
## re.search()
# >>> import re
# >>> match = re.search(r'[1-9]\d{5}', 'BIT 100081')
# >>> if match:
# ...     print(match.group(0))
# ...
# 100081


# #re.match()
# >>> import re
# >>> match = re.match(r'[1-9]\d{5}', 'BIT 100081')
# >>> if match:
# ...     match.group(0)
# ...
# >>> match.group(0)
# Traceback (most recent call last):
#   File "<stdin>", line 1, in <module>
# AttributeError: 'NoneType' object has no attribute 'group'
# >>>
# >>>
# >>>
# >>> import re
# >>> match = re.match(r'[1-9]\d{5}', '100081 BIT')
# >>> if match:
# ...     match.group(0)
# ...
# '100081'


# re.findall()
# >>> import re
# >>> ls = re.findall(r'[1-9]\d{5}', 'BIT100081 TSU100084')
# >>> ls
# ['100081', '100084']


# #re.split()
# >>> import re
# >>> re.split(r'[1-9]\d{5}', 'BIT100081 TSU100081')
# ['BIT', ' TSU', '']
# >>> re.split(r'[1-9]\d{5}', 'BIT100081 TSU100081', maxsplit = 1)
# ['BIT', ' TSU100081']


# #re.finditer()
# >>> import re
# >>> for m in re.finditer(r'[1-9]\d{5}', 'BIT100081 TSU100084'):    ## yields an iterator of match objects
# ...     if m:
# ...         print(m.group(0))
# ...
# 100081
# 100084


## re.sub()
# >>> import re
# >>> re.sub(r'[1-9]\d{5}', ':zipcode', 'BIT100081 TSU100084')
# 'BIT:zipcode TSU:zipcode'
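The same operations are available as methods on a compiled pattern object, which avoids recompiling when a pattern is reused many times:

>>> import re
>>> pat = re.compile(r'[1-9]\d{5}')   # compile once, reuse everywhere
>>> pat.search('BIT 100081').group(0)
'100081'
>>> pat.findall('BIT100081 TSU100084')
['100081', '100084']
>>> pat.sub(':zipcode', 'BIT100081')
'BIT:zipcode'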



# # properties of the match object
# >>> import re
# >>> m = re.search(r'[1-9]\d{5}', 'BIT100081 TSU100084')
# >>> m.string
# 'BIT100081 TSU100084'
# >>> m.re
# re.compile('[1-9]\\d{5}')
# >>> m.pos
# 0
# >>> m.endpos
# 19
# >>> m.group(0)
# '100081'
# >>> m.start()
# 3
# >>> m.end()
# 9
# >>> m.span()
# (3, 9)



# # greedy vs. non-greedy matching
# >>> match = re.search(r'PY.*N', 'PYANBNCNDN')  # greedy: longest match
# >>> match.group(0)
# 'PYANBNCNDN'
# >>> match = re.search(r'PY.*?N', 'PYANBNCNDN')  # non-greedy: shortest match
# >>> match.group(0)
# 'PYAN'



# ## crawl Taobao product prices
# import requests
# import re
#
# def getHTMLText(url):
#     try:
#         r = requests.get(url, timeout = 30)
#         r.raise_for_status()
#         r.encoding = r.apparent_encoding
#         return r.text
#     except:
#         return ""
#
# def parsePage(ilt, html):
#     try:
#         plt = re.findall(r'"view_price":"[\d.]*"', html)   # price fields in the page's embedded JSON
#         tlt = re.findall(r'"raw_title":".*?"', html)       # title fields (non-greedy match)
#         for i in range(len(plt)):
#             price = eval(plt[i].split(':')[1])   # eval strips the surrounding quotes
#             title = eval(tlt[i].split(':')[1])
#             ilt.append([price, title])
#     except:
#         print("")   # silently skip pages that fail to parse
#
# def printGoodsList(ilt):
#     tplt = "{:4}\t{:8}\t{:16}"
#     print(tplt.format("序号", "价格", "商品名称"))
#     count = 0
#     for g in ilt:
#         count += 1
#         print(tplt.format(count, g[0], g[1]))
#
# def main():
#     goods = '书包'   # search keyword ("backpack")
#     depth = 3        # number of result pages to crawl
#     start_url = 'https://s.taobao.com/search?q=' + goods
#     infoList = []
#     for i in range(depth):
#         try:
#             url = start_url + '&s=' + str(44 * i)   # each result page holds 44 items
#             html = getHTMLText(url)
#             parsePage(infoList, html)
#         except:
#             continue
#     printGoodsList(infoList)
#
# main()
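parsePage strips the surrounding quotes with eval, which executes whatever text the regex captured and is therefore risky on scraped input. A safer sketch of the same quote-stripping step using json.loads (the sample string is illustrative):

import json

raw = '"view_price":"128.00"'             # one match as produced by the regex above
price = json.loads(raw.split(':')[1])     # -> '128.00', with the quotes removed and no eval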


# crawl stock information
# import requests
# from bs4 import BeautifulSoup
# import traceback
# import re
#
#
# def getHTMLText(url):
#     try:
#         r = requests.get(url)
#         r.raise_for_status()
#         r.encoding = r.apparent_encoding
#         return r.text
#     except:
#         return ""
#
#
# def getStockList(lst, stockURL):
#     html = getHTMLText(stockURL)
#     soup = BeautifulSoup(html, 'html.parser')
#     a = soup.find_all('a')
#     for i in a:
#         try:
#             href = i.attrs['href']
#             lst.append(re.findall(r"[s][hz]\d{6}", href)[0])
#         except:
#             continue
#
#
# def getStockInfo(lst, stockURL, fpath):
#     for stock in lst:
#         url = stockURL + stock + ".html"
#         html = getHTMLText(url)
#         try:
#             if html == "":
#                 continue
#             infoDict = {}
#             soup = BeautifulSoup(html, 'html.parser')
#             stockInfo = soup.find('div', attrs={'class': 'stock-bets'})
#
#             name = stockInfo.find_all(attrs={'class': 'bets-name'})[0]
#             infoDict.update({'股票名称': name.text.split()[0]})   # key: "stock name"
#
#             keyList = stockInfo.find_all('dt')
#             valueList = stockInfo.find_all('dd')
#             for i in range(len(keyList)):
#                 key = keyList[i].text
#                 val = valueList[i].text
#                 infoDict[key] = val
#
#             with open(fpath, 'a', encoding='utf-8') as f:
#                 f.write(str(infoDict) + '\n')
#         except:
#             traceback.print_exc()
#             continue
#
#
# def main():
#     stock_list_url = 'https://quote.eastmoney.com/stocklist.html'
#     stock_info_url = 'https://gupiao.baidu.com/stock/'
#     output_file = 'D:/BaiduStockInfo.txt'
#     slist = []
#     getStockList(slist, stock_list_url)
#     getStockInfo(slist, stock_info_url, output_file)
#
#
# main()
## Optimizations: faster decoding and a progress meter
# import requests
# from bs4 import BeautifulSoup
# import traceback
# import re
#
#
# def getHTMLText(url, code="utf-8"):  ## optimization: the caller supplies the encoding
#     try:
#         r = requests.get(url)
#         r.raise_for_status()
#         r.encoding = code    ## optimization: skips the slow apparent_encoding detection
#         return r.text
#     except:
#         return ""
#
#
# def getStockList(lst, stockURL):
#     html = getHTMLText(stockURL, "GB2312")   ## optimization: the stock list page is GB2312-encoded
#     soup = BeautifulSoup(html, 'html.parser')
#     a = soup.find_all('a')
#     for i in a:
#         try:
#             href = i.attrs['href']
#             lst.append(re.findall(r"[s][hz]\d{6}", href)[0])
#         except:
#             continue
#
#
# def getStockInfo(lst, stockURL, fpath):
#     count = 0    ## optimization: progress counter
#     for stock in lst:
#         url = stockURL + stock + ".html"
#         html = getHTMLText(url)
#         try:
#             if html == "":
#                 continue
#             infoDict = {}
#             soup = BeautifulSoup(html, 'html.parser')
#             stockInfo = soup.find('div', attrs={'class': 'stock-bets'})
#
#             name = stockInfo.find_all(attrs={'class': 'bets-name'})[0]
#             infoDict.update({'股票名称': name.text.split()[0]})   # key: "stock name"
#
#             keyList = stockInfo.find_all('dt')
#             valueList = stockInfo.find_all('dd')
#             for i in range(len(keyList)):
#                 key = keyList[i].text
#                 val = valueList[i].text
#                 infoDict[key] = val
#
#             with open(fpath, 'a', encoding='utf-8') as f:
#                 f.write(str(infoDict) + '\n')
#                 count = count + 1    ## optimization: advance the progress counter
#                 print("\r当前进度: {:.2f}%".format(count * 100 / len(lst)), end="")    ## "当前进度" = "progress"; '\r' rewinds to line start so the figure updates in place
#         except:
#             count = count + 1
#             print("\r当前进度: {:.2f}%".format(count * 100 / len(lst)), end="")
#             continue
#
#
# def main():
#     stock_list_url = 'https://quote.eastmoney.com/stocklist.html'
#     stock_info_url = 'https://gupiao.baidu.com/stock/'
#     output_file = 'D:/BaiduStockInfo.txt'
#     slist = []
#     getStockList(slist, stock_list_url)
#     getStockInfo(slist, stock_info_url, output_file)
#
# main()