python requests补充

最新推荐文章于 2023-12-18 14:21:28 发布

lvbu89757

最新推荐文章于 2023-12-18 14:21:28 发布

阅读量129

点赞数

分类专栏：爬虫

本文链接：https://blog.csdn.net/lvbu89757/article/details/96838187

版权

爬虫专栏收录该内容

9 篇文章 0 订阅

订阅专栏

Python requests 使用代理服务器

import requests

# Map each protocol to the proxy that should handle requests of that type.
# NOTE: 148.399.56.79 is a placeholder (399 is not a valid IPv4 octet);
# substitute the address of a real proxy server before running this.
proxies = {"http": "http://148.399.56.79:9527"}

response = requests.get("http://www.baidu.com", proxies=proxies)

# Private (authenticated) proxy: the user name and password are embedded in
# the proxy URL as scheme://user:password@host:port. The scheme is required,
# so it is spelled out here just like in the anonymous proxy above.
proxy = {"http": "http://xx:123456@148.399.56.79:9527"}

response = requests.get("http://www.baidu.com", proxies=proxy)

多线程

    import requests
    import time
    from Queue import Queue
    from lxml import etree

    import threading

    class Douban(object):
        """Fetch the Douban movie Top 250 listing pages, one thread per page.

        Every page is downloaded and parsed in its own thread. Each movie found
        is pushed onto ``data_queue`` as the score, a tab character and the
        title, and ``count`` holds the number of movies collected.
        """

        # Seconds to wait for the server before giving up on a request, so that
        # a stalled connection cannot block ``start_work`` in ``join()`` forever.
        REQUEST_TIMEOUT = 10

        def __init__(self):
            """Prepare the request headers, the page URLs and the result queue."""
            self.headers = {"User-Agent" : "Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; rv:11.0) like Gecko"}
            self.base_url = "https://movie.douban.com/top250?start="
            # start=0, 25, ..., 225: ten listing pages in total.
            self.url_list = [self.base_url + str(page) for page in range(0, 225 + 1, 25)]
            # Queue that collects the parsed results from all worker threads.
            self.data_queue = Queue()
            self.count = 0
            # ``count`` is updated from several threads; ``+=`` on an attribute
            # is a read-modify-write, so it has to be serialized.
            self._count_lock = threading.Lock()

        def send_request(self, url):
            """Download ``url`` and pass the response body to ``parse_page``.

            A request that fails or times out is reported and skipped so the
            remaining pages are still processed.
            """
            print("[INFO]: 正在抓取" + url)
            try:
                html = requests.get(
                    url, headers=self.headers, timeout=self.REQUEST_TIMEOUT
                ).content
            except requests.RequestException as err:
                print("[ERROR]: %s: %s" % (url, err))
                return
            # Pause for one second after each request.
            time.sleep(1)
            self.parse_page(html)

        def parse_page(self, html):
            """Extract score and title of every movie in ``html`` into the queue.

            Entries that lack a title or a score node are skipped instead of
            raising ``IndexError`` and discarding the rest of the page.
            """
            html_obj = etree.HTML(html)

            entries = []
            for node in html_obj.xpath("//div[@class='info']"):
                # Movie title
                titles = node.xpath("./div[@class='hd']/a/span[1]/text()")
                # Movie score
                scores = node.xpath(".//span[@class='rating_num']/text()")
                if not titles or not scores:
                    continue
                entries.append(scores[0] + "\t" + titles[0])

            for entry in entries:
                self.data_queue.put(entry)

            # One locked update per page keeps the shared counter exact.
            with self._count_lock:
                self.count += len(entries)

        def start_work(self):
            """Crawl all pages concurrently, then print the results and the total."""
            # Sequential alternative: call self.send_request(url) for each url
            # in self.url_list instead of starting threads.
            thread_list = []
            for url in self.url_list:
                # One worker thread per listing page.
                thread = threading.Thread(target=self.send_request, args=(url,))
                thread.start()
                thread_list.append(thread)

            # Block the main thread until every worker has finished, so the
            # queue and the counter are complete before they are printed.
            for thread in thread_list:
                thread.join()

            while not self.data_queue.empty():
                print(self.data_queue.get())

            print(self.count)

    if __name__ == "__main__":
        # Crawl every page and report how long the whole run took.
        douban = Douban()
        start = time.time()
        douban.start_work()

        # The call form of print with a single argument runs unchanged on
        # both Python 2 and Python 3; the print statement is Python 2 only.
        print("[INFO]: Useing time %f secend" % (time.time() - start))
        # Sample output: [INFO]: Useing time 1.483035 secend

lvbu89757

关注

0
点赞
踩
0

收藏

觉得还不错? 一键收藏
0
评论
python requests补充

python requests上代理服务器# 根据协议类型，选择不同的代理proxies = {"http": "http://148.399.56.79:9527"}response = requests.get("http://www.baidu.com", proxies = proxies)# 私密代理，代表用户名与密码proxy = { "http": "xx:123456...
复制链接

扫一扫