#!/usr/bin/env python
# encoding: utf-8
"""Baidu Tieba crawler (Python 2).

Prompts for a search keyword, fetches the Tieba search-result page, and
extracts every thread link from the result list via XPath.  The visible
chunk ends with the link loop; `os` and `copyfile` are presumably used by
the (truncated) remainder that downloads images — TODO confirm.
"""
import urllib2
from lxml import etree
import urllib
import os
from shutil import copyfile

if __name__ == "__main__":
    # Ask the user for the search keyword and URL-encode it as the `kw`
    # query parameter expected by tieba.baidu.com.
    keyword = raw_input("请输入需要搜索的内容:")
    kw = urllib.urlencode({"kw": keyword})
    url = "http://tieba.baidu.com/f?" + kw

    header = {"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_0) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11"}
    # Bug fix: `header` was constructed but never attached to the request,
    # so the crawler identified itself as plain urllib2.  Pass it so the
    # site sees a browser-like User-Agent.
    request = urllib2.Request(url, headers=header)

    response = urllib2.urlopen(request)
    try:
        html = response.read()
    finally:
        # Release the underlying socket deterministically instead of
        # leaking it until garbage collection.
        response.close()

    # Parse the HTML and collect every thread href from the result list.
    # (The class name "cleafix" is Tieba's own markup typo, not ours.)
    content = etree.HTML(html)
    link_list = content.xpath('//div[@class="t_con cleafix"]/div/div/div/a/@href')
    for link in link_list:
        # Thread hrefs are site-relative ("/p/..."); prepend the host.
        fulllink = "http://tieba.baidu.com" + link
        # print fulllink
百度贴吧爬虫并保存至目录中
最新推荐文章于 2023-10-28 13:38:56 发布
![](https://img-home.csdnimg.cn/images/20240711042549.png)