# -*- coding: utf-8 -*-
import pandas as pd
from bs4 import BeautifulSoup
import urllib.request as req
import urllib
import re
import json
import datetime
class BaiduCrawler:
    """Crawl Baidu search results for a keyword and save recent hits to CSV.

    Fetches ``inter_page`` result pages for ``target``, keeps only the
    entries whose publication date falls within the last ``inter_days``
    days, and writes (title, url) rows to a CSV file.
    """

    # Legacy "baidulocal" result layout; the keyword is appended to `wd=`.
    link = 'http://www.baidu.com/s?ie=utf-8&f=8&rsv_bp=1&rsv_idx=1&tn=baidulocal&wd='

    def __init__(self, inter_page, target, inter_days):
        """Store crawl parameters.

        Args:
            inter_page: number of result pages to fetch.
            target: search keyword (raw, quoted later in crawler()).
            inter_days: keep results no older than this many days;
                coerced via int(), so a numeric string is accepted.
        """
        self.inter_page = inter_page
        self.inter_days = datetime.timedelta(days=int(inter_days))
        self.target = target
        # Optional HTTP proxy support (disabled). To enable:
        #   proxy = req.ProxyHandler({'http': '127.0.0.1:3128'})
        #   opener = req.build_opener(proxy)
        #   req.install_opener(opener)

    def get_threads(self, soup):
        """Return the result-entry cells: each hit is a <td class="f">."""
        return soup.find_all('td', {'class': 'f'})

    def baidu_search(self, page, game_name):
        """Fetch one result page (0-based) and return it parsed as soup.

        Args:
            game_name: URL-quoted search keyword.
        """
        pn = page * 10  # Baidu paginates in steps of 10 results.
        print('pn is ', pn)
        url = self.link + game_name + '&pn=' + str(pn)
        print('url is ', url)
        conn = req.urlopen(url)
        try:
            text = conn.read().decode()
        finally:
            conn.close()  # BUGFIX: response was never closed.
        return BeautifulSoup(text, 'html.parser')

    def crawler(self, out_path='d:\\tmp\\baidu.csv'):
        """Run the crawl and write matching (title, url) rows to out_path.

        Args:
            out_path: CSV destination (defaults to the original hard-coded
                path, so existing callers are unaffected).
        """
        today = datetime.date.today()
        name = urllib.parse.quote(self.target)
        dfs = []
        for page in range(self.inter_page):
            soup = self.baidu_search(page, name)
            # BUGFIX: the row buffer must be per-page; the original reused
            # one list across pages, so every page's DataFrame re-contained
            # all earlier pages and pd.concat duplicated rows.
            rows = []
            for thread in threads_iter(self, soup):
                # The last <font> holds "...\xa0YYYY-MM-DD\xa0..."; the
                # second-to-last chunk is the publication date.
                up_date = str(thread.select('font')[-1].text.split('\xa0')[-2])
                dt = datetime.datetime.strptime(up_date, '%Y-%m-%d').date()
                if today - dt <= self.inter_days:
                    anchor = thread.select('a')[0]
                    rows.append((anchor.text, anchor['href']))
            # BUGFIX: assigning .columns on an empty frame raised; skip
            # pages with no in-window results instead.
            if rows:
                dfs.append(pd.DataFrame(rows, columns=['title', 'url']))
        # BUGFIX: pd.concat([]) raises; report and bail out gracefully.
        if not dfs:
            print('No results within the time window; nothing to save.')
            return
        final_df = pd.concat(dfs, ignore_index=True)
        print('Saving file ...')
        # BUGFIX: the original saved `df` (last page only), discarding the
        # concatenated result.
        final_df.to_csv(path_or_buf=out_path, encoding='utf-8')
        print('It is OK!')


def threads_iter(crawler, soup):
    """Compatibility shim: delegate to the instance's get_threads()."""
    return crawler.get_threads(soup)
def baidu():
    """Demo entry point: crawl one page of results for a sample keyword."""
    pages_to_fetch = 1      # number of result pages to pull
    keyword = 'xxxx'        # the search term
    window_days = 30        # only keep results from the last N days
    job = BaiduCrawler(pages_to_fetch, keyword, window_days)
    job.crawler()
# Run the demo crawl only when executed as a script (not on import).
if __name__ == '__main__':
    baidu()