python http.cookiejar

最新推荐文章于 2024-05-22 16:28:54 发布

SayLove丶

最新推荐文章于 2024-05-22 16:28:54 发布

阅读量461

点赞数

分类专栏： python3 爬虫

本文链接：https://blog.csdn.net/qq_34776122/article/details/78084265

版权

python3 同时被 2 个专栏收录

16 篇文章

订阅专栏

爬虫

9 篇文章

订阅专栏

import urllib.request
import urllib.parse
import re
import http.cookiejar
def url_open(url='http://xxx/xxx.com'):
    """POST the login form data to *url* and return the response body.

    The request is sent through an opener that carries a cookie jar, so any
    cookies the server sets while answering (for example during a login
    redirect) are sent back on the follow-up requests of the same exchange.

    Args:
        url: Address to request.

    Returns:
        The raw, undecoded response body as ``bytes``.

    Raises:
        urllib.error.URLError: If the request cannot be completed.
    """
    # Placeholder credentials; fill in real values before running.
    credentials = {'os_username': '*****', 'os_password': '******'}
    payload = urllib.parse.urlencode(credentials).encode('utf-8')
    request = urllib.request.Request(url, payload)

    cookie_jar = http.cookiejar.CookieJar()
    opener = urllib.request.build_opener(
        urllib.request.HTTPCookieProcessor(cookie_jar)
    )

    # Read the body from the cookie-aware opener itself.  Fetching it with a
    # separate plain urlopen() call would bypass the cookie jar entirely and
    # send the credentials a second time.
    with opener.open(request) as response:
        return response.read()

def del_span(name):
    """Strip a ``<span>``, ``<strong>`` or ``<p>`` wrapper from a cell value.

    If *name* contains none of those three literal tags it is returned
    unchanged.  Otherwise the first non-empty piece of text that follows a
    closing ``>`` is returned, so nested wrappers such as
    ``<p><strong>text</strong></p>`` yield ``text`` instead of an empty
    string.

    Args:
        name: Cell content, possibly wrapped in markup.

    Returns:
        The unwrapped text, the original string when no known tag is
        present, or ``''`` when the markup contains no text at all.
    """
    if not any(tag in name for tag in ('<span>', '<strong>', '<p>')):
        return name

    # Everything before the first '>' is the opening tag (or leading text),
    # which is ignored.  Each later chunk is "text<next-tag".
    for chunk in name.split('>')[1:]:
        text = chunk.split('<')[0]
        if text:
            return text
    return ''

def _extract_record(cells):
    """Build one output tuple from a row's header cells, or None if too short."""
    if len(cells) < 10:
        return None
    scope = cells[9]
    # When cell 9 holds a change-type keyword, the scope is in the next cell.
    if any(word in scope for word in ('修改', '新增', '添加')):
        if len(cells) < 11:
            return None
        scope = cells[10]
    # Cells 1..5 are, in order: add, modile, demand, develop, test.
    return tuple(del_span(value) for value in cells[1:6] + [scope])


def pageid(url, page_ids=('23010896', '23701365')):
    """Scrape selected header cells from the tables of the given pages.

    For every page ID the page is fetched with ``url_open``, the section
    between ``<table class="confluenceTable"><thead>`` and
    ``</thead></table></div>`` is cut out, and each ``<tr>`` row in it is
    examined.  Rows containing zero or exactly two ``YYYY-M-D`` dates are
    skipped; from the remaining rows six cells are picked, stripped of
    wrapper tags with ``del_span`` and collected.

    Args:
        url: Base URL of the site, ending with ``/``.
        page_ids: Page IDs to visit; defaults to the two IDs that were
            previously hard-coded.

    Returns:
        A list of ``(add, modile, demand, develop, test, scope)`` tuples,
        one per accepted row, in page and row order.
    """
    table_start = '<table class="confluenceTable"><thead>'
    table_end = '</thead></table></div>'
    # Compiled once, outside the loops.  Raw strings keep ``\d`` from being
    # read as an (invalid) string escape sequence.
    row_re = re.compile(r'<tr>(.*?)</tr>')
    date_re = re.compile(r'(\d{4}[-]\d+[-]\d+)')
    cell_re = re.compile(r'eTh">(.*?)</th>')

    records = []
    for page_id in page_ids:
        page_url = url + 'pages/viewpage.action?pageId=%s' % page_id
        html = url_open(page_url).decode('utf-8')

        # Narrow the document down to the table section of interest.
        start = html.find(table_start)
        if start == -1:
            continue  # no such table on this page
        end = html.find(table_end, start)
        section = html[start:end] if end != -1 else html[start:]

        for row in row_re.findall(section):
            # Rows with no date or with exactly two dates are not wanted.
            if len(date_re.findall(row)) in (0, 2):
                continue
            record = _extract_record(cell_re.findall(row))
            # Rows with too few cells are skipped instead of raising
            # IndexError.
            if record is not None:
                records.append(record)
    return records
url = 'http://xxx.xxx.com/'

# Only scrape when run as a script, so importing this module does not
# trigger network requests.
if __name__ == '__main__':
    # Iterate the result directly; binding it to the name `sum` would
    # shadow the builtin for the rest of the module.
    for record in pageid(url):
        print(record)