#!/usr/bin/python3
# coding: utf-8
'''
@auth: levAndreev
@time: 2022/3/12
'''
import requests
from bs4 import BeautifulSoup
import numpy as np
import os
import sys
# HTTP headers sent with every request.  The dated IE6 user-agent string is
# what the original author chose so the scrape looks like a plain browser
# to 17500.cn; referenced by getHtml() below.
Hostreferer = {
'User-Agent': 'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1)',
}
def getHtml(url, timeout=30):
    """Fetch *url* and return the response body as text.

    Parameters
    ----------
    url : str
        Page to download.
    timeout : float, optional
        Seconds before the request is aborted.  ``requests`` has no
        default timeout, so without this a stalled connection would
        hang the scraper forever.

    Returns
    -------
    str
        The decoded response body (``requests`` guesses the encoding).
    """
    req = requests.get(url, headers=Hostreferer, timeout=timeout)
    return req.text
def getp5(page, last):
    """Scrape one result page and return the rows newer than *last*.

    Parameters
    ----------
    page : int
        1-based page number of the all.php listing.
    last : int
        Newest draw number already stored; rows with a draw number
        <= last are not collected.

    Returns
    -------
    (list[str], bool)
        Tab-separated rows (each ending in '\\n'), oldest first, and a
        "done" flag that is True when there is nothing further to fetch
        (empty page, or an already-known draw number was reached).
    """
    ret = []
    url = f'https://www.17500.cn/p5/all.php?p={page}'
    soup = BeautifulSoup(getHtml(url), 'html.parser')
    trs = soup.findAll('tr', {'bgcolor': '#ffffff'})
    if len(trs) < 1:
        return ret, True
    for it in trs:
        if int(it.contents[0].text) <= last:
            # Reached a draw we already have: stop paging.
            return ret, True
        # First six cells: draw number, date, and the winning digits.
        row = '\t'.join(c.text for c in it.contents[:6]) + '\n'
        # Pages list newest first; insert at the front so ret ends up
        # oldest-first (page sizes are small, so O(n^2) is harmless).
        ret.insert(0, row)
        print(row[:-1])
    return ret, False
def lastSeq(path):
    """Return the newest draw number stored in *path* and prep it for append.

    Creates an empty file and returns 0 when *path* does not exist.
    Otherwise reads the tail of the file, parses the draw number (the
    first 7 characters) of the last data line, and strips the trailing
    "`;" terminator so new rows can be appended.

    Returns
    -------
    int
        The last stored draw number, or 0 when the file is new/too small.
    """
    if not os.path.exists(path):
        open(path, 'w').close()
        return 0
    with open(path, 'rb+') as f:
        # A 100-byte tail window is enough to cover the last data line
        # plus the "`;" terminator line.
        n = min(os.path.getsize(path), 100)
        if n < 3:
            return 0
        f.seek(-n, os.SEEK_END)
        lines = f.readlines()          # [..., last data line, b'`;']
        # Strip only the 2-byte "`;" terminator.  The last data line must
        # keep its '\n' — the original truncated 3 bytes, eating that
        # newline, so the next appended row was glued onto the old line.
        f.seek(-2, os.SEEK_END)
        f.truncate()
        # lines[-1] is b'`;'; lines[-2] is the newest data row whose
        # first 7 bytes are the draw number (e.g. b'2022059').
        return int(str(lines[-2][0:7], "utf-8"))
def main(argv):
    """Scrape any new draws and append them to the data file.

    Parameters
    ----------
    argv : list[str]
        Optional [destination_path]; defaults to './p5.txt'.

    Side effects: creates/extends the destination file (a JS-style
    ``var history_data=`...`;`` blob) and prints each new row.
    """
    dest = argv[0] if argv else './p5.txt'
    data = []
    last = lastSeq(dest)  # also strips the old "`;" terminator for appending
    for page in range(1, 100):  # hard page cap as a safety stop
        ret, end = getp5(page, last)
        data = ret + data       # earlier pages hold newer rows; keep oldest first
        if end:
            break
    # A brand-new file (last == 0) gets the JS variable header.
    suff = 'var history_data=`\n' if last == 0 else ''
    with open(dest, 'a+') as f:
        f.write(f'{suff}{"".join(data)}`;')
    print(f'write to {dest} ok.')


# Guard the entry point so importing this module does not trigger a scrape
# (the original called main() unconditionally at import time).
if __name__ == '__main__':
    main(sys.argv[1:])
# NOTE(review): the lines below are CSDN article-page text that was pasted
# into the source file by mistake; left here as comments (instead of bare
# text, which made the file unparseable as Python).
# python读取数据用例  (Python data-reading example)
# 于 2022-03-12 15:21:58 首次发布  (first published 2022-03-12 15:21:58)
# 这是一个Python爬虫脚本,使用requests和BeautifulSoup库从17500.cn网站抓取特定页面的数据。程序首先定义了请求头,然后通过getHtml函数获取HTML内容,接着解析并筛选出特定行的数据。数据存储到列表中,并最终写入到文件p5.txt。程序还包含了检查和更新最后序列号的功能,以避免重复抓取。
# 摘要由CSDN通过智能技术生成  (abstract generated automatically by CSDN)