lz是爬虫初学者,在第一次实践过程中遇到好几次报错,调试多遍,最后报错固定在KeyboardInterrupt。
爬取的网址是螺纹钢实时报价 - 生意社期现实时报价系统,需求是将4页的数据中的价格和时间保存在excel文件的同一张表中。
代码如下:
import requests import logging import re import pandas as pd import os logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s: %(message)s') BASE_URL = 'https://pdata.100ppi.com/?f=basket&dir=futures&id=927&p=' TOTAL_PAGE = 4 def scrape_page(url): # 一般的返回HTML代码的方法 logging.info('scraping %s...', url) try: response = requests.get(url) if response.status_code == 200: return response.text logging.error('get invalid status code %s while scraping %s', response.status_code, url) except requests.RequestException: logging.error('error occured while scraping %s', url, exc_info=True, stack_info=True) def scrape_index(page): # 通过拼接,返回所需爬取的4个页面的Html代码 index_url = f'{BASE_URL}/{page}/#futures_927' return scrape_page(index_url) def parse_detail(html): time_pattern = re.compile(r'(<td>\d{4}-\d{2}-\d{2}\s*?\d{2}:\d{2})</td>') time = re.findall(time_pattern, html) if (re. findall(time_pattern, html)) else [] price_pattern = re.compile(r'<td>d{4}元/吨\s*?</td>') price = re.findall(price_pattern, html) if (re. findall(price_pattern, html)) else [] return pd.DataFrame({'time': time, 'Price': price}) def save_data(data): if not os.path.exists('rbdata.xlsx'): with pd.ExcelWriter('rbdata.xlsx', engine='xlsxwriter') as writer: data.to_excel(writer, index=False) else: # 如果 Excel 文件已存在,则直接追加数据到现有工作表中 with pd.ExcelWriter('rbdata.xlsx', engine='xlsxwriter') as writer: data.to_excel(writer, index=False, header=False) def main(): for page in range(1, TOTAL_PAGE + 1): # 得到page1-4 detail_htmls = scrape_index(page) for detail_html in detail_htmls: data = parse_detail(detail_html) logging.info('get detail data %s', data) save_data(data) logging.info('data saved successfully') if __name__ == '__main__': main()
改成以上后不知道怎么继续改了,程序可以执行,但是生成的表要么是损坏要么是没有数据,问GPT也没用,球球好心人指教