Web Scraper

A simple scraper for the artillery listings on weapon.huanqiu.com: it walks the paginated list, collects each weapon's detail-page link, then writes every entry's spec text to a local file.

# -*- coding: UTF-8 -*-
from urllib import request
from bs4 import BeautifulSoup
import time

def retrieve_href(root_url):
    """Collect every detail-page link from the paginated weapon list."""
    response = request.urlopen(root_url)
    html = response.read().decode("utf-8")
    soup = BeautifulSoup(html, 'lxml')
    # The total item count sits in an <i> tag inside the 'infoText' element;
    # each list page shows 12 items, so round up to cover a partial last page.
    total_items = int(soup.find(class_='infoText').i.text)
    num_pages = (total_items + 11) // 12
    print(num_pages)
    list_href = []
    for i in range(num_pages):
        print('######################{}#######################'.format(i + 1))
        url_list = "{}list_0_0_0_0_{}".format(root_url, i + 1)
        print(url_list)
        response = request.urlopen(url_list)
        html = response.read().decode("utf-8")
        soup = BeautifulSoup(html, 'lxml')
        # Each entry's link is an <a> inside a 'name' element under 'picList'.
        pic_list = soup.find(class_='picList')
        if pic_list is None:
            continue
        for name_tag in pic_list.find_all(class_='name'):
            for a in name_tag.find_all('a'):
                list_href.append(a['href'])
                print(a['href'])
    return list_href


def retrieve_info(page_url):
    """Fetch one detail page and return the text of its 'dataInfo' list items."""
    try:
        response = request.urlopen(page_url, timeout=60)
    except Exception as e:
        return "Request Error:{}".format(e)

    html = response.read().decode("utf-8")
    soup = BeautifulSoup(html, 'lxml')
    ret_info = '\n'
    for block in soup.find_all(class_='dataInfo'):
        for li in block.find_all('li'):
            info = li.text.strip()
            # Skip footnote-style lines such as "(1) ...".
            if info.find('(1)') >= 0:
                continue
            ret_info = ret_info + '\n{}'.format(info)
    return ret_info

if __name__ == "__main__":
    root_url = "http://weapon.huanqiu.com/weaponlist/artillery/"
    list_href = retrieve_href(root_url)
    print('######################ALL#######################')
    print(list_href)

    n = len(list_href)
    root_url = "http://weapon.huanqiu.com"
    with open('D:/artillery.txt', 'a', encoding='utf-8') as f:
        for i, url_name in enumerate(list_href, start=1):
            # Resume support: skip entries already written by an earlier run.
            if i <= 240:
                continue
            print('####################{}/{}#########################'.format(i, n))
            page_url = root_url + url_name
            ret_info = retrieve_info(page_url)
            print(ret_info + '\n' + page_url)
            f.write(ret_info + '\n' + page_url)
            time.sleep(1)  # throttle to one request per second
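
On a crawl of a few hundred pages, a single timeout in retrieve_info loses that entry with only an error string in the output file. A small retry wrapper keeps transient network errors from costing data. The sketch below is an illustration, not part of the original script; fetch_html is a hypothetical helper, and the max_retries and delay values are arbitrary assumptions rather than values tuned for the target site.

# Minimal retry sketch (illustrative; max_retries/delay are assumptions):
# wrap urlopen so a transient network error does not lose the page.
from urllib import request
import time

def fetch_html(url, max_retries=3, delay=5, timeout=60):
    """Return the decoded HTML of url, retrying on failure."""
    for attempt in range(1, max_retries + 1):
        try:
            response = request.urlopen(url, timeout=timeout)
            return response.read().decode("utf-8")
        except Exception as e:
            print("attempt {}/{} failed: {}".format(attempt, max_retries, e))
            if attempt == max_retries:
                raise
            time.sleep(delay)

With a helper like this, both retrieve_href and retrieve_info could call fetch_html in place of their direct urlopen calls, keeping the fetch-decode-retry logic in one place.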