Python 读取数据用例 2

maopaopao

已于 2022-09-13 13:56:27 修改

阅读量113

点赞数

于 2022-04-07 15:49:38 首次发布

本文链接：https://blog.csdn.net/maopaopao/article/details/124018355

版权

数据抓取彩票 17500 BeautifulSoup 数据分析

关键词由CSDN通过智能技术生成

#!/usr/bin/python3
# coding: utf-8
'''
@auth: levAndreev
@time: 2022/4/7
'''
import requests
from bs4 import BeautifulSoup
import numpy as np
import os
import sys
 
 
# Request headers passed to every HTTP fetch; only the User-Agent is set.
Hostreferer = {'User-Agent': 'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1)'}
 
def getHtml(url, timeout=30):
    """Download *url* with the shared request headers and return its text.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server. Without a timeout
            ``requests`` can block forever on a stalled connection.

    Returns:
        The response body as decoded by ``requests`` (``Response.text``).
        The HTTP status code is not checked, so error pages are returned
        like any other body.

    Raises:
        requests.exceptions.Timeout: If the server does not answer in time.
    """
    response = requests.get(url, headers=Hostreferer, timeout=timeout)
    return response.text

def get17500(page, last):
    """Collect the rows of one result page whose number is greater than *last*.

    The page is fetched, every ``<tr>`` is collected, and the first and last
    rows are discarded. For each remaining row the text of its first child
    node is compared, as an integer, with *last*.
    NOTE(review): presumably child 0 is the draw number and child 3 the drawn
    balls -- confirm against the live page markup.

    Args:
        page: Page number substituted into the URL.
        last: Rows whose number is not greater than this stop the scan.

    Returns:
        A ``(lines, done)`` tuple. ``lines`` holds ``"<number>\\t<text>\\n"``
        strings in reverse page order. ``done`` is True when the page had no
        usable rows or a row at or below *last* was reached.
    """
    html = getHtml(f'https://www.17500.cn/widget/_ssq/ssqfanjiang/p/{page}.html')
    # Slicing drops the first and the last <tr>, exactly like del/pop would.
    rows = BeautifulSoup(html, 'html.parser').findAll('tr')[1:-1]

    collected = []
    if not rows:
        return collected, True

    for row in rows:
        cells = row.contents
        seq_text = cells[0].text
        if int(seq_text) <= last:
            # Everything from here on is already stored.
            return collected, True
        entry = f'{seq_text}\t{cells[3].text}\n'
        collected.insert(0, entry)
        print(entry[:-1])
    return collected, False
 
def lastSeq(path):
    """Return the last stored sequence number and strip the file's trailer.

    If *path* does not exist it is created empty and 0 is returned. Otherwise
    up to the last 100 bytes are read, the first 7 bytes of the second-to-last
    line in that tail are parsed as an integer, and the final 3 bytes of the
    file are removed so that a caller can append to it.

    The number is parsed *before* the file is truncated, so a file whose tail
    does not have the expected shape raises without losing any bytes.

    Args:
        path: Location of the history file.

    Returns:
        The parsed sequence number, or 0 when the file is new or shorter than
        3 bytes (in which case nothing is truncated).

    Raises:
        IndexError: If the tail contains fewer than two lines.
        ValueError: If the first 7 bytes of that line are not an integer.
    """
    if not os.path.exists(path):
        # First run: leave an empty file behind for the caller to append to.
        open(path, 'w').close()
        return 0
    with open(path, 'rb+') as f:
        tail_size = min(os.path.getsize(path), 100)
        if tail_size < 3:
            return 0
        f.seek(-tail_size, os.SEEK_END)
        tail_lines = f.readlines()
        # Validate first; truncating before this line could corrupt the file
        # when the tail is malformed.
        last = int(str(tail_lines[-2][0:7], "utf-8"))
        f.seek(-3, os.SEEK_END)
        f.truncate()
        return last

def main(argv):
    """Fetch rows newer than the stored ones and append them to the file.

    Args:
        argv: Command-line arguments without the program name. ``argv[0]``,
            when present, overrides the default output path ``./d2.txt``.

    The output file has the shape ``var history_data=`` + backtick, one row
    per line, then a closing backtick and semicolon. ``lastSeq`` removes the
    last three bytes of an existing file (the newline before the trailer and
    the trailer itself), so when appending the newline must be written back
    first; otherwise the first new row is glued onto the last stored row.
    """
    dest = argv[0] if len(argv) > 0 else './d2.txt'
    last = lastSeq(dest)

    data = []
    for page in range(1, 100):
        ret, end = get17500(page, last)
        # Rows from later pages are placed in front of the ones already held.
        data = ret + data
        if end:
            break

    if last == 0:
        # Fresh file: write the header; keep a lone newline when nothing
        # was fetched.
        prefix = 'var history_data=`\n'
        body = ''.join(data) if data else '\n'
    else:
        # Appending: restore the newline that lastSeq stripped.
        prefix = '\n'
        body = ''.join(data)

    with open(dest, 'a+') as f:
        f.write(f'{prefix}{body}`;')

    print(f'write to {dest} ok.')


# Run the scraper with the command-line arguments, minus the program name.
main(argv=sys.argv[1:])
##############################################
#!/usr/bin/python3
# coding: utf-8
import requests
from bs4 import BeautifulSoup
import numpy as np
import os
import sys
 
 
# Request headers passed to every HTTP fetch; only the User-Agent is set.
Hostreferer = {'User-Agent': 'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1)'}
 
def getHtml(url, timeout=30):
    """Download *url* with the shared request headers and return its text.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server. Without a timeout
            ``requests`` can block forever on a stalled connection.

    Returns:
        The response body as decoded by ``requests`` (``Response.text``).
        The HTTP status code is not checked, so error pages are returned
        like any other body.

    Raises:
        requests.exceptions.Timeout: If the server does not answer in time.
    """
    response = requests.get(url, headers=Hostreferer, timeout=timeout)
    return response.text

# Inclusive [low, high] bounds stored as consecutive pairs; scope() reads
# this list two entries at a time.
s1 = [
    1, 9,
    10, 19,
    20, 29,
    30, 33,
]
# Same pair layout with three ranges; not referenced by the code visible here.
s2 = [
    1, 11,
    12, 22,
    23, 33,
]
def scope2(it):
    """Look up ``int(it)`` in a one-element table and return the entry.

    The table only holds ``"3"``, so the call succeeds for indices 0 and -1
    and raises ``IndexError`` for every other value.
    """
    table = ["3"]
    return table[int(it)]

def scope(it):
    """Return the index of the ``s1`` range that contains ``int(it)``.

    ``s1`` is read as four consecutive inclusive ``[low, high]`` pairs; the
    result is the 0-based position of the first pair containing the value.
    A value outside every pair also yields 0.
    """
    value = int(it)
    for bucket in range(4):
        low = s1[2 * bucket]
        high = s1[2 * bucket + 1]
        if low <= value <= high:
            return bucket
    return 0

def getitem(line):
    """Count how many tokens of *line* fall into each of the four ranges.

    *line* is split on single spaces and the final token is ignored; every
    remaining token is classified with ``scope``.

    Returns:
        A list of four counters, indexed by the value ``scope`` returned.
    """
    counts = [0] * 4
    for token in line.split(' ')[:-1]:
        counts[scope(token)] += 1
    return counts

def get17500(page, last):
    """Collect the rows of one result page whose number is greater than *last*.

    The page is fetched, every ``<tr>`` is collected, and the first and last
    rows are discarded. For each remaining row the text of child 3 gets a
    space inserted before every ``+``, the per-range counts from ``getitem``
    are appended after `` | ``, and the text of child 0 is compared, as an
    integer, with *last*.
    NOTE(review): presumably child 0 is the draw number and child 3 the drawn
    balls -- confirm against the live page markup.

    Args:
        page: Page number substituted into the URL.
        last: Rows whose number is not greater than this stop the scan.

    Returns:
        A ``(lines, done)`` tuple. ``lines`` holds
        ``"<number>\\t<text> | <counts>\\n"`` strings in reverse page order.
        ``done`` is True when the page had no usable rows or a row at or
        below *last* was reached.
    """
    html = getHtml(f'https://www.17500.cn/widget/_ssq/ssqfanjiang/p/{page}.html')
    # Slicing drops the first and the last <tr>, exactly like del/pop would.
    rows = BeautifulSoup(html, 'html.parser').findAll('tr')[1:-1]

    collected = []
    if not rows:
        return collected, True

    for row in rows:
        cells = row.contents
        numbers = cells[3].text.replace('+', ' +')
        zone_counts = ' '.join(map(str, getitem(numbers)))
        seq_text = cells[0].text
        if int(seq_text) <= last:
            # Everything from here on is already stored.
            return collected, True
        collected.insert(0, f'{seq_text}\t{numbers} | {zone_counts}\n')
    return collected, False
 

def lastSeq2(path):
    """Return the last stored sequence number and strip the file's trailer.

    If *path* does not exist it is created empty and 0 is returned. Otherwise
    up to the last 100 bytes are read, the first 7 bytes of the second-to-last
    line in that tail are parsed as an integer, and the final 3 bytes of the
    file are removed so that a caller can append to it.

    Files shorter than 3 bytes are left untouched and yield 0; seeking 3
    bytes back from the end of such a file would raise ``OSError``. The
    number is parsed *before* truncating, so a malformed tail raises without
    losing any bytes.

    Args:
        path: Location of the history file.

    Returns:
        The parsed sequence number, or 0 when the file is new or shorter than
        3 bytes.

    Raises:
        IndexError: If the tail contains fewer than two lines.
        ValueError: If the first 7 bytes of that line are not an integer.
    """
    if not os.path.exists(path):
        # First run: leave an empty file behind for the caller to append to.
        open(path, 'w').close()
        return 0
    with open(path, 'rb+') as f:
        tail_size = min(os.path.getsize(path), 100)
        if tail_size < 3:
            return 0
        f.seek(-tail_size, os.SEEK_END)
        tail_lines = f.readlines()
        # Validate first; truncating before this line could corrupt the file
        # when the tail is malformed.
        last = int(str(tail_lines[-2][0:7], 'utf-8'))
        f.seek(-3, os.SEEK_END)
        f.truncate()
        return last
 
 
def main(argv):
    """Fetch rows newer than the stored ones and append them to the file.

    Args:
        argv: Command-line arguments without the program name. ``argv[0]``,
            when present, overrides the default output path ``./2d``.

    The output file has the shape ``var history_data=`` + backtick, one row
    per line, then a closing backtick and semicolon. ``lastSeq2`` removes the
    last three bytes of an existing file (the newline before the trailer and
    the trailer itself), so when appending the newline must be written back
    first; otherwise new rows are glued onto the last stored row, and a run
    that fetches nothing leaves the trailer on the same line as that row.
    """
    dest = argv[0] if len(argv) > 0 else './2d'
    last = lastSeq2(dest)

    data = []
    for page in range(1, 100):
        ret, end = get17500(page, last)
        # Rows from later pages are placed in front of the ones already held.
        data = ret + data
        if end:
            break

    # Fresh file: write the header. Existing file: restore the newline that
    # lastSeq2 stripped together with the trailer.
    prefix = 'var history_data=`\n' if last == 0 else '\n'
    with open(dest, 'a+') as f:
        f.write(f'{prefix}{"".join(data)}`;')

    print(f'write to {dest} ok.')
 
# Run the scraper with the command-line arguments, minus the program name.
main(argv=sys.argv[1:])