[Yum-devel] [PATCH] add URLGrabberOptions.find_proxy()

Fri Mar 16 10:19:31 UTC 2012

Separate the proxy lookup and proxy setopt, so we can do it early.
This has the following benefits:

- When using parallel downloads, only a single process interfaces
  with the libproxy library.

- The downloader does not have to deal with opts.proxies, eliminating
  the need to marshal dictionaries.

- Possible switch to a callback API instead of libproxy=ON/OFF.

It's tempting to call find_proxy() from URLParser.parse(), but
the user may(?) supply their own opts.urlparser, so better not.
---
 urlgrabber/grabber.py |   68 ++++++++++++++++++++++++++++--------------------
 1 files changed, 40 insertions(+), 28 deletions(-)

diff --git a/urlgrabber/grabber.py b/urlgrabber/grabber.py
index 512ba0b..7224a55 100644
--- a/urlgrabber/grabber.py
+++ b/urlgrabber/grabber.py
@@ -815,6 +815,40 @@ class URLGrabberOptions:
         else: # throttle is a float
             return self.bandwidth * self.throttle
         
+    def find_proxy(self, url, scheme):
+        """Set self.proxy to the proxy to use for url (None if none applies).
+        A non-empty proxies dict is used first; libproxy is only a fallback.
+        """
+        self.proxy = None
+        if scheme not in ('ftp', 'http', 'https'):
+            return
+        # A non-empty proxies dict is authoritative: libproxy is never consulted.
+        if self.proxies:
+            proxy = self.proxies.get(scheme)
+            if proxy is None:
+                if scheme == 'http':
+                    proxy = self.proxies.get('https')
+                elif scheme == 'https':
+                    proxy = self.proxies.get('http')
+            if proxy != '_none_':  # '_none_' leaves self.proxy as None
+                self.proxy = proxy
+            return
+        # _libproxy_cache: None = not tried yet, False = libproxy unavailable.
+        if self.libproxy:
+            global _libproxy_cache
+            if _libproxy_cache is None:
+                try:
+                    import libproxy
+                    _libproxy_cache = libproxy.ProxyFactory()
+                except Exception:  # best effort, but let KeyboardInterrupt/SystemExit through
+                    _libproxy_cache = False
+            if _libproxy_cache:
+                for proxy in _libproxy_cache.getProxies(url):
+                    if proxy.startswith('http://'):
+                        if DEBUG: DEBUG.info('using proxy "%s" for url %s' % (proxy, url))
+                        self.proxy = proxy
+                        break
+
     def derive(self, **kwargs):
         """Create a derived URLGrabberOptions instance.
         This method creates a new instance and overrides the
@@ -967,6 +1001,7 @@ class URLGrabber(object):
         opts = self.opts.derive(**kwargs)
         if DEBUG: DEBUG.debug('combined options: %s' % repr(opts))
         (url,parts) = opts.urlparser.parse(url, opts) 
+        opts.find_proxy(url, parts[0])
         def retryfunc(opts, url):
             return PyCurlFileObject(url, filename=None, opts=opts)
         return self._retry(opts, retryfunc, url)
@@ -982,6 +1017,7 @@ class URLGrabber(object):
         if DEBUG: DEBUG.debug('combined options: %s' % repr(opts))
         (url,parts) = opts.urlparser.parse(url, opts) 
         (scheme, host, path, parm, query, frag) = parts
+        opts.find_proxy(url, scheme)
         if filename is None:
             filename = os.path.basename( urllib.unquote(path) )
             if not filename:
@@ -1042,6 +1078,7 @@ class URLGrabber(object):
         opts = self.opts.derive(**kwargs)
         if DEBUG: DEBUG.debug('combined options: %s' % repr(opts))
         (url,parts) = opts.urlparser.parse(url, opts) 
+        opts.find_proxy(url, parts[0])
         if limit is not None:
             limit = limit + 1
             
@@ -1280,34 +1317,9 @@ class PyCurlFileObject(object):
         if hasattr(opts, 'raw_throttle') and opts.raw_throttle():
             self.curl_obj.setopt(pycurl.MAX_RECV_SPEED_LARGE, int(opts.raw_throttle()))
             
-        # proxy settings
-        proxy = None
-        if opts.proxies and self.scheme in ('ftp', 'http', 'https'):
-            # use proxies dict
-            proxy = opts.proxies.get(self.scheme)
-            if proxy is None:
-                if self.scheme == 'http':
-                    proxy = opts.proxies.get('https')
-                elif self.scheme == 'https':
-                    proxy = opts.proxies.get('http')
-        elif opts.libproxy:
-            # import libproxy
-            global _libproxy_cache
-            if _libproxy_cache is None:
-                try:
-                    import libproxy
-                    _libproxy_cache = libproxy.ProxyFactory()
-                except ImportError:
-                    _libproxy_cache = False
-            # use if available
-            if _libproxy_cache:
-                for a_proxy in _libproxy_cache.getProxies(self.url):
-                    if a_proxy.startswith('http://'):
-                        proxy = a_proxy
-                        if DEBUG: DEBUG.info('using proxy "%s" for url %s' % (proxy, self.url))
-                        break
-        if proxy and proxy != '_none_':
-            self.curl_obj.setopt(pycurl.PROXY, proxy)
+        # proxy
+        if opts.proxy:
+            self.curl_obj.setopt(pycurl.PROXY, opts.proxy)
             self.curl_obj.setopt(pycurl.PROXYAUTH, pycurl.HTTPAUTH_ANY)
 
         if opts.username and opts.password:
-- 
1.7.4.4