python 未知

最新推荐文章于 2024-08-26 18:56:03 发布

bakengsan7666

最新推荐文章于 2024-08-26 18:56:03 发布

阅读量107

点赞数

文章标签： python

原文链接：http://www.cnblogs.com/Justice-V/p/8157180.html

版权

import time
import requests
from bs4 import BeautifulSoup
import threading


def format_str(s):
    """Return *s* with every newline and tab character removed.

    The result is safe to store as one field of a tab-separated,
    one-record-per-line text file.

    Spaces are left untouched.
    NOTE(review): an earlier revision also chained a no-op
    ``.replace("", "")`` here, which may have been meant to strip spaces --
    confirm the intent before changing the behaviour.
    """
    return s.replace("\n", "").replace("\t", "")



def _extract_book_links(html):
    """Return the de-duplicated ``(title, href)`` book links found in *html*.

    An ``<a>`` tag is kept when its ``href`` contains ``'bookid'``, does not
    contain ``'shopcar0.jsp'``, and its text is non-empty once newlines and
    tabs have been removed. First-seen order is preserved.
    """
    # Name the parser explicitly so the result does not depend on which
    # optional parsers happen to be installed.
    soup = BeautifulSoup(html, 'html.parser')
    seen = set()
    links = []
    for anchor in soup.find_all('a', href=True):
        href_val = anchor['href']
        if 'bookid' not in href_val or 'shopcar0.jsp' in href_val:
            continue
        title = format_str(anchor.text)
        if not title:
            continue
        # De-duplicate on the *formatted* pair: that is what gets stored, so
        # comparing the raw values would let duplicates through.
        entry = (title, format_str(href_val))
        if entry not in seen:
            seen.add(entry)
            links.append(entry)
    return links


def get_urls_in_pages(from_page_num, to_page_num, search_word='计算机'):
    """Collect book links from a range of search-result pages into a file.

    Fetches every result page from ``from_page_num`` to ``to_page_num``
    (both inclusive), extracts the book links of each page, and writes them
    to ``<from_page_num>_<to_page_num>_all_hrefs.txt`` in the current working
    directory as UTF-8, one ``title<TAB>href`` record per line. Links are
    de-duplicated within a page, not across pages. Each requested URL and a
    final ``from to count`` summary are printed to stdout.

    Args:
        from_page_num: First result page to fetch.
        to_page_num: Last result page to fetch (inclusive).
        search_word: Keyword appended to the search URL.

    Raises:
        requests.exceptions.RequestException: If a page cannot be fetched,
            including when the request times out.
        OSError: If the output file cannot be written.
    """
    # NOTE(review): the query string carries a second, hard-coded ``Page=2``
    # after the per-page value. It is kept as found; confirm how the server
    # resolves the duplicate parameter.
    url_part_1 = 'http://www.phei.com.cn/moudle/goods/searchkey.jsp?Page='
    url_part_2 = '&Page=2&searchKey='

    all_href_list = []
    for page_num in range(from_page_num, to_page_num + 1):
        url = url_part_1 + str(page_num) + url_part_2 + search_word
        print(url)
        # Without a timeout a stalled connection would block forever.
        resp = requests.get(url, timeout=30)
        all_href_list += _extract_book_links(resp.text)

    out_name = '{}_{}_all_hrefs.txt'.format(from_page_num, to_page_num)
    # Explicit UTF-8: titles are Chinese and the platform default encoding
    # may not be able to represent them.
    with open(out_name, 'w', encoding='utf-8') as all_href_file:
        all_href_file.writelines(
            '\t'.join(link) + '\n' for link in all_href_list
        )
    print(from_page_num, to_page_num, len(all_href_list))

转载于:https://www.cnblogs.com/Justice-V/p/8157180.html

bakengsan7666

关注

0
点赞
踩
0

收藏

觉得还不错? 一键收藏
0
评论
python 未知

import time; import requests; from bs4 import BeautifulSoup; import threading; def format_str(s): return s.replace("\n","").replace("","").replace("\t",''); def get_urls_in_pages(from_page_num,to_page_n...
复制链接

扫一扫