python的爬虫代码_python爬虫代码

#encoding=utf-8

import contextlib
import datetime
import re
import sys

try:
    import urllib2
except ImportError:
    # Python 3: Request and urlopen are provided by urllib.request.
    import urllib.request as urllib2

import MySQLdb
import requests
from bs4 import BeautifulSoup

# The default-encoding override relies on the Python 2 ``reload`` builtin and
# ``sys.setdefaultencoding``; it is skipped on other major versions.
if sys.version_info[0] == 2:
    reload(sys)  # noqa: F821
    sys.setdefaultencoding("utf-8")


class Splider(object):
    """Crawl paginated whowhatwear.com listing pages and store the articles.

    Typical flow (see the ``__main__`` section below): ``changepage`` expands
    a start URL into the listing URLs, ``getsource`` downloads each one,
    ``getcontent`` turns a listing page into article records and ``saveinfo``
    writes the collected records to MySQL.
    """

    def __init__(self):
        print(u'开始爬取内容...')

    def getsource(self, url):
        """Download ``url`` and return the response body as read from the socket.

        The request is sent with a desktop Chrome ``User-Agent`` header.  The
        response object is closed even when reading fails.
        """
        headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2652.0 Safari/537.36'}
        req = urllib2.Request(url=url, headers=headers)
        with contextlib.closing(urllib2.urlopen(req)) as response:
            return response.read()

    def changepage(self, url, total_page):
        """Build the listing URLs from the page found in ``url`` up to ``total_page``.

        ``url`` must contain a ``page/<number>`` segment.  The returned list
        starts at that page number and ends at ``total_page`` (inclusive); it
        is empty when the starting page is already past ``total_page``.

        Raises:
            ValueError: if ``url`` has no ``page/<number>`` segment.
        """
        page_pattern = r'page/(\d+)'
        match = re.search(page_pattern, url)
        if match is None:
            raise ValueError("url has no 'page/<number>' segment: %r" % (url,))
        now_page = int(match.group(1))
        # Note: the fourth positional argument of re.sub is ``count``, not
        # ``flags``, so no flag is passed positionally here.
        return [
            re.sub(page_pattern, 'page/%d' % page, url)
            for page in range(now_page, total_page + 1)
        ]

    def getchildrencon(self, child_url):
        """Fetch ``child_url`` and extract the article text and image URLs.

        Returns a dict with ``'con'`` (text of the ``c-article_content`` div)
        and ``'img'`` (its ``src="..."`` values joined with ``;``).  Both are
        empty strings when the page has no such div.
        """
        html = self.getsource(child_url)
        soup = BeautifulSoup(html, 'html.parser', from_encoding='utf-8')
        article = soup.find('div', {'class': 'c-article_content'})
        if article is None:
            return {'con': '', 'img': ''}
        images = re.findall(r'src="(.*?)"', str(article), re.S)
        return {'con': article.get_text(), 'img': ';'.join(images)}

    def getcontent(self, html_doc):
        """Parse a listing page into article records.

        Every ``div.promo-feed-headline`` element is converted with
        ``_build_record`` (which downloads the linked article).  Returns a
        dict mapping consecutive integers starting at 0 to record dicts;
        entries that cannot be converted are left out.
        """
        soup = BeautifulSoup(html_doc, 'html.parser', from_encoding='utf-8')
        info = {}
        for headline in soup.find_all('div', {'class': 'promo-feed-headline'}):
            record = self._build_record(headline)
            if record is not None:
                # len(info) keeps the keys consecutive even after a skip.
                info[len(info)] = record
        return info

    def _build_record(self, headline):
        """Turn one listing entry into a record dict, or None if markup is missing.

        Required markup: an ``h3`` title, a ``div.post-date`` carrying a
        ``data-date`` attribute, a link whose href matches
        ``section=fashion-trends`` and, on the linked article page, a
        ``section.widgets-list-content`` element.
        """
        title_tag = headline.find('h3')
        date_tag = headline.find('div', {'class': 'post-date'})
        source_link = headline.find('a', href=re.compile(r"section=fashion-trends"))
        if title_tag is None or date_tag is None or source_link is None:
            return None
        post_date = date_tag.get('data-date')
        if post_date is None:
            return None

        source_url = 'http://www.whowhatwear.com' + source_link['href']
        article_soup = BeautifulSoup(
            self.getsource(source_url), 'html.parser', from_encoding='utf-8')
        body = article_soup.find('section', {'class': 'widgets-list-content'})
        if body is None:
            return None

        # The summary section is optional; fall back to an empty summary.
        text_section = article_soup.find('section', {'class': 'text'})
        summary = text_section.get_text().strip('\n') if text_section is not None else ''
        img_list = re.findall(r'src="(.*?)"', str(body), re.S)

        return {
            'title': title_tag.get_text(),
            'content_time': post_date[0:10],
            'source': 'whowhatwear',
            'source_url': source_url,
            'content': body.get_text().strip('\n'),
            'summary': summary[0:200] + '...',
            'imgs': ';'.join(img_list),
            'create_time': datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
        }

    def saveinfo(self, content_info):
        """Insert the collected records into the ``t_fashion_spider2`` table.

        ``content_info`` is an iterable of dicts as returned by
        ``getcontent`` (one per listing page).  All values are passed to the
        driver as query parameters instead of being formatted into the SQL
        text.  The rows are committed together; on any error the transaction
        is rolled back and the exception re-raised.  The cursor and the
        connection are always closed.  Nothing is done (and no connection is
        opened) when there are no records.
        """
        columns = ('title', 'summary', 'content', 'content_time',
                   'imgs', 'source', 'source_url', 'create_time')
        sql = ("insert into t_fashion_spider2(`title`,`summary`,`content`,"
               "`content_time`,`imgs`,`source`,`source_url`,`create_time`) "
               "values (%s,%s,%s,%s,%s,%s,%s,%s)")
        rows = [
            tuple(record[column] for column in columns)
            for page in content_info
            for record in page.values()
        ]
        if not rows:
            return

        conn = MySQLdb.connect(host='127.0.0.1', user='root', passwd='123456',
                               port=3306, db='test', charset='utf8')
        try:
            cursor = conn.cursor()
            try:
                cursor.executemany(sql, rows)
                conn.commit()
            except Exception:
                conn.rollback()
                raise
            finally:
                cursor.close()
        finally:
            conn.close()


if __name__ == '__main__':
    classinfo = []
    p_num = 5
    url = 'http://www.whowhatwear.com/section/fashion-trends/page/1'
    jikesplider = Splider()
    all_links = jikesplider.changepage(url, p_num)
    for link in all_links:
        print(u'正在处理页面:' + link)
        html = jikesplider.getsource(link)
        info = jikesplider.getcontent(html)
        classinfo.append(info)
    # Save once, after every listing page has been collected.
    jikesplider.saveinfo(classinfo)

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值