Web Scraper

A simple scraper for the artillery listings on weapon.huanqiu.com: it walks the paginated list, collects each weapon's detail-page link, then writes every entry's spec text to a local file.

# -*- coding: UTF-8 -*-
from urllib import request
from bs4 import BeautifulSoup
import time

def retrieve_href(root_url):
    """Collect every detail-page link from the paginated weapon list."""
    response = request.urlopen(root_url)
    html = response.read().decode("utf-8")
    soup = BeautifulSoup(html, 'lxml')
    # The total item count sits in an <i> tag inside the 'infoText' element;
    # each list page shows 12 items, so round up to cover a partial last page.
    total_items = int(soup.find(class_='infoText').i.text)
    num_pages = (total_items + 11) // 12
    print(num_pages)
    list_href = []
    for i in range(num_pages):
        print('######################{}#######################'.format(i + 1))
        url_list = "{}list_0_0_0_0_{}".format(root_url, i + 1)
        print(url_list)
        response = request.urlopen(url_list)
        html = response.read().decode("utf-8")
        soup = BeautifulSoup(html, 'lxml')
        # Each entry's link is an <a> inside a 'name' element under 'picList'.
        pic_list = soup.find(class_='picList')
        if pic_list is None:
            continue
        for name_tag in pic_list.find_all(class_='name'):
            for a in name_tag.find_all('a'):
                list_href.append(a['href'])
                print(a['href'])
    return list_href


def retrieve_info(page_url):
    """Fetch one detail page and return the text of its 'dataInfo' list items."""
    try:
        response = request.urlopen(page_url, timeout=60)
    except Exception as e:
        return "Request Error:{}".format(e)

    html = response.read().decode("utf-8")
    soup = BeautifulSoup(html, 'lxml')
    ret_info = '\n'
    for block in soup.find_all(class_='dataInfo'):
        for li in block.find_all('li'):
            info = li.text.strip()
            # Skip footnote-style lines such as "(1) ...".
            if info.find('(1)') >= 0:
                continue
            ret_info = ret_info + '\n{}'.format(info)
    return ret_info

if __name__ == "__main__":
    root_url = "http://weapon.huanqiu.com/weaponlist/artillery/"
    list_href = retrieve_href(root_url)
    print('######################ALL#######################')
    print(list_href)

    n = len(list_href)
    root_url = "http://weapon.huanqiu.com"
    with open('D:/artillery.txt', 'a', encoding='utf-8') as f:
        for i, url_name in enumerate(list_href, start=1):
            # Resume support: skip entries already written by an earlier run.
            if i <= 240:
                continue
            print('####################{}/{}#########################'.format(i, n))
            page_url = root_url + url_name
            ret_info = retrieve_info(page_url)
            print(ret_info + '\n' + page_url)
            f.write(ret_info + '\n' + page_url)
            time.sleep(1)  # throttle to one request per second
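
On a crawl of a few hundred pages, a single timeout in retrieve_info loses that entry with only an error string in the output file. A small retry wrapper keeps transient network errors from costing data. The sketch below is an illustration, not part of the original script; fetch_html is a hypothetical helper, and the max_retries and delay values are arbitrary assumptions rather than values tuned for the target site.

# Minimal retry sketch (illustrative; max_retries/delay are assumptions):
# wrap urlopen so a transient network error does not lose the page.
from urllib import request
import time

def fetch_html(url, max_retries=3, delay=5, timeout=60):
    """Return the decoded HTML of url, retrying on failure."""
    for attempt in range(1, max_retries + 1):
        try:
            response = request.urlopen(url, timeout=timeout)
            return response.read().decode("utf-8")
        except Exception as e:
            print("attempt {}/{} failed: {}".format(attempt, max_retries, e))
            if attempt == max_retries:
                raise
            time.sleep(delay)

With a helper like this, both retrieve_href and retrieve_info could call fetch_html in place of their direct urlopen calls, keeping the fetch-decode-retry logic in one place.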