Searching Baidu with BeautifulSoup in Python 3.3


 # -*- coding: utf-8 -*- 
import pandas as pd
from bs4 import BeautifulSoup
import urllib.request as req
import urllib
import re
import json
import datetime


class BaiduCrawler:
    """Collect recent Baidu search results for a keyword and save them as CSV.

    The crawler requests ``inter_page`` result pages for ``target``, keeps the
    results whose displayed date is at most ``inter_days`` days before today,
    and writes their titles and links to a CSV file.
    """

    # Base search URL; the URL-quoted keyword and the ``pn`` value are appended.
    link = 'http://www.baidu.com/s?ie=utf-8&f=8&rsv_bp=1&rsv_idx=1&tn=baidulocal&wd='
    # File written by crawler() when no explicit output path is given.
    default_output_path = 'd:\\tmp\\baidu.csv'

    def __init__(self, inter_page, target, inter_days):
        """Store the crawl settings.

        Args:
            inter_page: Number of result pages to request.
            target: Search keyword (unquoted).
            inter_days: Maximum age of a result in days; anything ``int()``
                accepts.
        """
        self.inter_page = inter_page
        self.inter_days = datetime.timedelta(days=int(inter_days))
        self.target = target
        # To send requests through an HTTP proxy, install an opener here, e.g.:
        #   proxy = req.ProxyHandler({'http': '127.0.0.1:3128'})
        #   req.install_opener(req.build_opener(proxy))

    def get_threads(self, soup):
        """Return every ``<td class="f">`` element found in ``soup``.

        NOTE(review): presumably each such cell is one search result; verify
        against the current page markup.
        """
        return soup.find_all('td', {'class': 'f'})

    def baidu_search(self, page, game_name):
        """Fetch one result page and return it parsed with ``html.parser``.

        Args:
            page: Zero-based page index; it is sent as ``pn = page * 10``.
            game_name: Search keyword, already URL-quoted.

        Returns:
            A ``BeautifulSoup`` document built from the response body.
        """
        pn = page * 10
        print('pn is ', pn)
        url = self.link + game_name + '&pn=' + str(pn)
        print('url is ', url)
        # The context manager closes the connection even if reading fails.
        with req.urlopen(url) as conn:
            # Honour the charset declared by the server; fall back to UTF-8.
            charset = conn.headers.get_content_charset() or 'utf-8'
            text = conn.read().decode(charset)
        return BeautifulSoup(text, 'html.parser')

    @staticmethod
    def _parse_update_date(thread):
        """Return the date shown in a result cell, or None if it has none.

        The date is read from the second-to-last ``'\\xa0'``-separated field of
        the cell's last ``<font>`` element and must look like ``YYYY-MM-DD``.
        """
        fonts = thread.select('font')
        if not fonts:
            return None
        fields = fonts[-1].text.split('\xa0')
        if len(fields) < 2:
            return None
        try:
            return datetime.datetime.strptime(fields[-2], '%Y-%m-%d').date()
        except ValueError:
            return None

    def crawler(self, output_path=None):
        """Crawl the configured pages and write the recent results to CSV.

        Result cells without a parseable date or without a link are skipped
        instead of aborting the whole crawl. The CSV always has the columns
        ``title`` and ``url``, even when nothing matched.

        Args:
            output_path: Destination CSV file; defaults to
                ``default_output_path``.
        """
        if output_path is None:
            output_path = self.default_output_path

        today = datetime.date.today()
        name = urllib.parse.quote(self.target)
        rows = []
        for page in range(self.inter_page):
            soup = self.baidu_search(page, name)
            for thread in self.get_threads(soup):
                updated = self._parse_update_date(thread)
                if updated is None or today - updated > self.inter_days:
                    continue
                anchors = thread.select('a')
                if not anchors:
                    continue
                first_link = anchors[0]
                rows.append((first_link.text, first_link.get('href', '')))

        # Naming the columns up front keeps an empty result set valid; assigning
        # ``df.columns`` afterwards raises on a frame that has no columns.
        df = pd.DataFrame(rows, columns=['title', 'url'])
        print('Saving file ...')
        df.to_csv(path_or_buf=output_path, encoding='utf-8')
        print('It is OK!')
  
def baidu():
    """Run a single crawl using the sample settings defined below."""
    settings = {
        'inter_page': 1,    # how many result pages to fetch
        'target': 'xxxx',   # the keyword to search for
        'inter_days': 30,   # keep results from within this many days
    }
    BaiduCrawler(**settings).crawler()
    
# Start the sample crawl only when this file is run as a script, not on import.
if __name__ == '__main__':
    baidu()


评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值