# -*- coding: utf-8 -*-
import pandas as pd
from bs4 import BeautifulSoup
import urllib.request as req
import urllib
import re
import json
import datetime
class BaiduCrawler:
    """Crawl Baidu search results for a keyword and save recent hits to CSV.

    Fetches ``inter_page`` result pages for ``target``, keeps only the
    entries whose publication date falls within the last ``inter_days``
    days, and writes (title, url) rows to a CSV file.
    """

    # Legacy "baidulocal" result layout; the keyword is appended to `wd=`.
    link = 'http://www.baidu.com/s?ie=utf-8&f=8&rsv_bp=1&rsv_idx=1&tn=baidulocal&wd='

    def __init__(self, inter_page, target, inter_days):
        """Store crawl parameters.

        Args:
            inter_page: number of result pages to fetch.
            target: search keyword (raw, quoted later in crawler()).
            inter_days: keep results no older than this many days;
                coerced via int(), so a numeric string is accepted.
        """
        self.inter_page = inter_page
        self.inter_days = datetime.timedelta(days=int(inter_days))
        self.target = target
        # Optional HTTP proxy support (disabled). To enable:
        #   proxy = req.ProxyHandler({'http': '127.0.0.1:3128'})
        #   opener = req.build_opener(proxy)
        #   req.install_opener(opener)

    def get_threads(self, soup):
        """Return the result-entry cells: each hit is a <td class="f">."""
        return soup.find_all('td', {'class': 'f'})

    def baidu_search(self, page, game_name):
        """Fetch one result page (0-based) and return it parsed as soup.

        Args:
            game_name: URL-quoted search keyword.
        """
        pn = page * 10  # Baidu paginates in steps of 10 results.
        print('pn is ', pn)
        url = self.link + game_name + '&pn=' + str(pn)
        print('url is ', url)
        conn = req.urlopen(url)
        try:
            text = conn.read().decode()
        finally:
            conn.close()  # BUGFIX: response was never closed.
        return BeautifulSoup(text, 'html.parser')

    def crawler(self, out_path='d:\\tmp\\baidu.csv'):
        """Run the crawl and write matching (title, url) rows to out_path.

        Args:
            out_path: CSV destination (defaults to the original hard-coded
                path, so existing callers are unaffected).
        """
        today = datetime.date.today()
        name = urllib.parse.quote(self.target)
        dfs = []
        for page in range(self.inter_page):
            soup = self.baidu_search(page, name)
            # BUGFIX: the row buffer must be per-page; the original reused
            # one list across pages, so every page's DataFrame re-contained
            # all earlier pages and pd.concat duplicated rows.
            rows = []
            for thread in threads_iter(self, soup):
                # The last <font> holds "...\xa0YYYY-MM-DD\xa0..."; the
                # second-to-last chunk is the publication date.
                up_date = str(thread.select('font')[-1].text.split('\xa0')[-2])
                dt = datetime.datetime.strptime(up_date, '%Y-%m-%d').date()
                if today - dt <= self.inter_days:
                    anchor = thread.select('a')[0]
                    rows.append((anchor.text, anchor['href']))
            # BUGFIX: assigning .columns on an empty frame raised; skip
            # pages with no in-window results instead.
            if rows:
                dfs.append(pd.DataFrame(rows, columns=['title', 'url']))
        # BUGFIX: pd.concat([]) raises; report and bail out gracefully.
        if not dfs:
            print('No results within the time window; nothing to save.')
            return
        final_df = pd.concat(dfs, ignore_index=True)
        print('Saving file ...')
        # BUGFIX: the original saved `df` (last page only), discarding the
        # concatenated result.
        final_df.to_csv(path_or_buf=out_path, encoding='utf-8')
        print('It is OK!')


def threads_iter(crawler, soup):
    """Compatibility shim: delegate to the instance's get_threads()."""
    return crawler.get_threads(soup)
def baidu():
    """Demo entry point: crawl one page of results for a sample keyword."""
    pages_to_fetch = 1      # number of result pages to pull
    keyword = 'xxxx'        # the search term
    window_days = 30        # only keep results from the last N days
    job = BaiduCrawler(pages_to_fetch, keyword, window_days)
    job.crawler()
# Run the demo crawl only when executed as a script (not on import).
if __name__ == '__main__':
    baidu()