Python学习笔记

最新推荐文章于 2024-10-08 12:37:10 发布

W1753900

最新推荐文章于 2024-10-08 12:37:10 发布

阅读量163

点赞数

文章标签： python

本文链接：https://blog.csdn.net/W1753900/article/details/123286363

版权

1.urllib补充

import urllib.request
发送一个 GET 请求
# Open the page inside a context manager so the HTTP connection is always
# closed, even if reading or decoding fails.
with urllib.request.urlopen('https://www.baidu.com') as response:
    # read() returns bytes; decode them as UTF-8 to get the page source as text.
    print(response.read().decode('utf-8'))

发送一个 POST 请求
import urllib.parse

# Form fields are URL-encoded and sent as bytes; passing data= makes urlopen
# issue a POST instead of a GET.
data = urllib.parse.urlencode({'hello': 'world'}).encode('utf-8')
# The context manager closes the connection once the body has been read.
with urllib.request.urlopen('http://httpbin.org/post', data=data) as response:
    print(response.read().decode('utf-8'))

超时处理
import socket
import urllib.error

# A deliberately tiny timeout to demonstrate timeout handling.
try:
    with urllib.request.urlopen('http://httpbin.org/get', timeout=0.01) as response:
        print(response.read().decode('utf-8'))
except urllib.error.URLError as e:
    # A timeout while connecting arrives wrapped in URLError; any other
    # URLError (DNS failure, refused connection, ...) is not a timeout, so
    # report its real reason instead of mislabelling it.
    if isinstance(e.reason, socket.timeout):
        print('timeout')
    else:
        print(e.reason)
except socket.timeout:
    # A timeout while reading the body is raised directly, not as URLError.
    print('timeout')




# Inspect response metadata; the context manager closes the connection afterwards.
with urllib.request.urlopen('http://www.baidu.com') as response:
    print(response.status)               # HTTP status code of the response
    print(response.getheader('Server'))  # value of the "Server" response header



url = 'http://httpbin.org/post'
# headers must be a dict mapping header names to values. The original used
# braces around a single string, which builds a set and makes Request() fail.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/98.0.4758.102 Safari/537.36 Edg/98.0.1108.62',
}
data = urllib.parse.urlencode({'hello': 'world'}).encode('utf-8')
req = urllib.request.Request(url=url, headers=headers, method='POST', data=data)
# The context manager closes the connection once the body has been read.
with urllib.request.urlopen(req) as response:
    print(response.read().decode('utf-8'))



url = 'http://www.douban.com'
# The header name is the dict key and the browser string is the value.
# Writing 'User-Agent:...' as one string inside braces creates a set, which
# Request() cannot use.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/98.0.4758.102 Safari/537.36 Edg/98.0.1108.62',
}
req = urllib.request.Request(url=url, headers=headers)
# The context manager closes the connection once the body has been read.
with urllib.request.urlopen(req) as response:
    print(response.read().decode('utf-8'))

注：headers 必须是一个字典，键为 'User-Agent'，值为从浏览器开发者工具的请求头中完整复制的 User-Agent 字符串；如果格式不对，请求会报错或被网站拒绝，无法爬取内容。

decode('utf-8') 的目的是把服务器返回的字节数据按 UTF-8 解码成字符串，便于阅读和后续解析。

2.豆瓣top250的爬取

from bs4 import BeautifulSoup   #网页分析
import xlwt   # 进行excel操作
import re    # 正则表达式，进行文字匹配
import urllib.request,urllib.error  # 进行数据获取
import sqlite3 #进行数据库操作
def main():
    """Run the Douban Top 250 crawl.

    Fetches every listing page through getData().  Saving is not wired up
    yet: saveData() is still a stub, so that call stays disabled.
    """
    baseurl = 'https://movie.douban.com/top250?start='
    # Crawl the listing pages.  getData() already requests every page, so the
    # extra debugging request for page 0 that used to follow has been dropped.
    datalist = getData(baseurl)
    # Output workbook in the current directory.  The previous raw string
    # r'.\\...' contained two literal backslashes and only worked on Windows.
    savepath = '豆瓣电影Top250.xls'
    # TODO: enable once saveData() actually writes datalist to savepath.
    # saveData(savepath)

#爬取网页
def getData(baseurl):
    """Request the ten listing pages and return the collected records.

    Each page URL is ``baseurl`` followed by a start offset (0, 25, ..., 225).
    The page source is fetched through askURL() but not parsed yet, so the
    returned list is always empty.
    """
    records = []
    for offset in range(0, 250, 25):
        page_url = baseurl + str(offset)
        page_source = askURL(page_url)  # fetched, parsing still to be written
    return records



#得到指定的网页内容
def askURL(url):
    """Fetch ``url`` and return the response body decoded as UTF-8.

    Args:
        url: Address to request.

    Returns:
        The decoded page source.  If the request fails with
        ``urllib.error.URLError``, the error's code and/or reason are printed
        and an empty string is returned.
    """
    # Browser-like User-Agent so the site treats the request as coming from a
    # regular browser.  The previous literal had stray spaces pasted into it
    # ("Mozilla / 5.0(", "98.0 .4758.102"), which is not a valid browser string.
    head = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/98.0.4758.102 Safari/537.36 Edg/98.0.1108.62'
    }
    request = urllib.request.Request(url, headers=head)
    html = ""
    try:
        # The context manager closes the response even if read/decode fails.
        with urllib.request.urlopen(request) as response:
            html = response.read().decode('utf-8')
        print(html)
    except urllib.error.URLError as e:
        # HTTPError carries both a status code and a reason; a plain URLError
        # carries only a reason.
        if hasattr(e, 'code'):
            print(e.code)
        if hasattr(e, 'reason'):
            print(e.reason)

    return html





def saveData(savepath):
    """Placeholder for the save step: only prints a progress marker.

    ``savepath`` is accepted but not used yet.
    """
    progress_marker = 'save......'
    print(progress_marker)


# Start the crawl only when this file is run as a script, so importing it
# does not trigger network requests.
if __name__ == '__main__':
    main()

3.如何爬取具体内容

from bs4 import BeautifulSoup
# Read the saved page inside a context manager so the file handle is closed
# as soon as the bytes have been read and decoded.
with open('./baidu.html','rb') as file:
    html=file.read().decode('utf-8')
bs=BeautifulSoup(html,'html.parser')
print(bs.title)      # first <title> tag in the document, markup included; the original note records the output as <title>百度一下，你就知道</title> for this page

# Tag: a tag together with its content; attribute access such as bs.title or bs.a returns only the first matching tag
print(bs.title.string)  # text inside the tag, without the surrounding <title> markup
print(bs.a.attrs)   # the attributes of the first <a> tag


print(bs.a.string)   # text inside the first <a> tag (the original note records 新闻 for this page)


# Traversing the document
print(bs.head.contents)  # list of the direct children of <head>
print(bs.head.contents[1])# the child at index 1 of that list, i.e. the second child
# For more ways to navigate and search, see the Beautiful Soup documentation



# Searching the document
# find_all() returns every element that matches the given filter
# String filter: matches tags whose name is exactly the given string (here, every <a> tag)
t_list=bs.find_all('a')

import re
# Regular-expression filter: tag names are tested with the pattern's search() method
t_list=bs.find_all(re.compile('a'))  # every tag whose name contains the letter "a"

#用方法来搜索  #传入一个函数(方法），根据函数要求来搜索
def name_is_exists(tag):
    """Filter for find_all(): report whether ``tag`` has a ``name`` attribute."""
    has_name = tag.has_attr('name')
    return has_name
t_list=bs.find_all(name_is_exists)   # every tag for which name_is_exists() is true, i.e. tags that have a "name" attribute