from bs4 import BeautifulSoup
import requests
import time
import json
class Spider:
    """Scrape second-hand laptop listings from 58.com and dump them to data.json.

    Pipeline (driven by :meth:`go`):
      1. fetch each listing page and collect item-link anchors,
      2. extract the hrefs,
      3. fetch each item page and pull category/title/date/price/place,
      4. write the collected records as JSON.
    """

    def __init__(self):
        # User-Agent + Cookie copied from a real browser session to get past
        # 58.com's anti-bot check.
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:74.0) Gecko/20100101 Firefox/74.0',
            'Cookie': 'userid360_xml=DA351ED943B7D514950F847645DEB635; time_create=1587472887135; f=n; commontopbar_new_city_info=1%7C%E5%8C%97%E4%BA%AC%7Cbj; commontopbar_ipcity=mm%7C%E8%8C%82%E5%90%8D%7C0; userid360_xml=DA351ED943B7D514950F847645DEB635; time_create=1587473108637; f=n; id58=c5/nn13Q11MweK6PMRylAg==; 58home=bj; city=bj; 58tj_uuid=7a6baace-998a-4b1b-8d83-4782fb6c0b66; new_uv=3; als=0; wmda_uuid=9dd05d0a6a6044021be1a1c6bab10d71; wmda_new_uuid=1; wmda_visited_projects=%3B11187958619315%3B1409632296065; xxzl_deviceid=5NSN2EJuLzsmCK8%2B7ZToyx1eJnR68wAN23F3N%2B%2FL9iJZc%2BYsmps2L%2BNMPAsT1gtT; gr_user_id=81fd7291-1408-4a46-8da0-5b6c7eed6f6b; Hm_lvt_3bb04d7a4ca3846dcc66a99c3e861511=1584880886; Hm_lvt_e15962162366a86a6229038443847be7=1584880886; GA_GTID=0d3628eb-002a-7b7c-5add-41dea874a818; _ga=GA1.2.1565386338.1584881078; _gid=GA1.2.1326103577.1584881078; Hm_lvt_e2d6b2d0ec536275bb1e37b421085803=1584881379; final_history=41582026010401; ppStore_fingerprint=F794BAF2D2A55929EC6A17AC7BEB2196216B781C04FD13BE%EF%BC%BF1584881381109; sessionid=5ab38641-633f-4619-85c4-65ae47bef1e6; Hm_lpvt_3bb04d7a4ca3846dcc66a99c3e861511=1584945621; Hm_lpvt_e15962162366a86a6229038443847be7=1584945621; f=n; xzfzqtoken=JprXuyYTloLFy4BngTOYF%2F%2B2mutsItJBcFI5zzLh7sI19IE7oXs5JkMDG6tsfN%2Bxin35brBb%2F%2FeSODvMgkQULA%3D%3D; zscom=%5B0%2C0%2C0%2C0%5D; Hm_lpvt_e2d6b2d0ec536275bb1e37b421085803=1584881379; commontopbar_new_city_info=1%7C%E5%8C%97%E4%BA%AC%7Cbj; new_session=1; utm_source=; spm=; init_refer=https%253A%252F%252Fcallback.58.com%252Fantibot%252Fverifycode%253FserialId%253Db607ceb049f6a879564cdce058aaeceb_c0901def04c343378eceba20f8fac405%2526code%253D21%2526sign%253Dd97c2ace4d7cc3274bbf39d667b15f06%2526namespace%253Dhuangyelistpc%2526url%253Dhttps%25253A%25252F%25252Finfo5.58.com%25252Fbj%25252Fpbdn%25252F0%25252F; commontopbar_ipcity=mm%7C%E8%8C%82%E5%90%8D%7C0; wmda_session_id_11187958619315=1584948561215-5519ce6b-e294-476a; xxzl_cid=2e5a615f7295460593a6e98a49ea5745; xzuid=9d64ff8d-b9c4-49dd-91dd-4836ddadb433',
        }
        # Listing pages 1..11 of the pbdn (laptop) category.
        self.urls = ['https://bj.58.com/pbdn/0/pn{}'.format(i) for i in range(1, 12)]
        # One Session so cookies set by the server persist across requests.
        self.s = requests.Session()
        self.url_list = []            # per-page lists of <a> tags found by the selector
        self.href_list = []           # flat list of item-detail URLs
        self.things_information = []  # extracted item records (dicts)

    def __get_urls(self):
        """Fetch each listing page and collect the item-link anchor tags."""
        for url in self.urls:
            html = self.s.get(url, headers=self.headers)
            soup = BeautifulSoup(html.text, 'lxml')
            # colspan is an attribute; its value must be quoted in the selector.
            a_page_urls = soup.select('tr.ac_item > td[colspan="3"]:nth-child(1) > a:nth-child(1)')
            self.url_list.append(a_page_urls)
            # Throttle: requesting too fast trips the anti-bot and gets the IP banned.
            time.sleep(20)
            print('success')

    def __get_src(self):
        """Extract the href attribute from every collected anchor tag."""
        for a_page_urls in self.url_list:
            for anchor in a_page_urls:
                self.href_list.append(anchor.get('href'))

    def __get_things_information(self):
        """Fetch each item page and extract category, title, date, price and place."""
        for href in self.href_list:
            # Pass the same headers as the listing requests so the
            # User-Agent/Cookie anti-bot workaround also covers detail pages.
            html = self.s.get(href, headers=self.headers)
            soup = BeautifulSoup(html.text, 'lxml')
            # Each select() returns a list; we only use the first match below.
            sort = soup.select('.crb_a_2')
            title = soup.select('.detail-title__name')
            date = soup.select('div.detail-title__info__text:nth-child(1)')
            money = soup.select('.infocard__container__item__main__text--price')
            place = soup.select('div.infocard__container__item:nth-child(2) > div:nth-child(2) > a:nth-child(1)')
            try:
                record = {
                    'sort': sort[0].get_text(),
                    'title': title[0].get_text(),
                    'date': date[0].get_text(),
                    'money': money[0].get_text(),
                    'place': place[0].get_text(),
                }
            except IndexError:
                # A selector matched nothing (page layout differs or the item
                # was removed) — skip this item, best effort.
                pass
            else:
                self.things_information.append(record)

    def __write_information(self):
        """Write the collected records to data.json (UTF-8, human-readable)."""
        with open('data.json', 'w', encoding='utf-8') as file:
            json.dump(self.things_information, file, indent=2, ensure_ascii=False)

    def go(self):
        """Run the full scrape pipeline: collect links, fetch items, write JSON."""
        print('start')
        self.__get_urls()
        self.__get_src()
        self.__get_things_information()
        self.__write_information()
        print('end')
# Guard the entry point so importing this module does not immediately
# start a multi-minute network scrape.
if __name__ == '__main__':
    spider = Spider()
    spider.go()
# 注意:'sort':sort[0].get_text() — sort 的类型是列表 (select() returns a list)。
# td[colspan="3"] — colspan 是属性名,具体值要用双引号。
# 请求太快会遭遇反爬,封IP,暂时还不会代理IP池,只能设置时间间隔了。
# 阅读量是JS控制,还不会爬取。