[yum-commits] urlgrabber/grabber.py

skvidal at osuosl.org skvidal at osuosl.org
Thu Aug 6 19:22:54 UTC 2009


 urlgrabber/grabber.py |   98 ++++++++++++++++++++++++++++++++------------------
 1 file changed, 63 insertions(+), 35 deletions(-)

New commits:
commit 3957ce44ad7d224da3073f20e65de0209ff07b5e
Author: Seth Vidal <skvidal at fedoraproject.org>
Date:   Thu Aug 6 15:18:10 2009 -0400

    - fix interrupt handler and document why KeyboardInterrupt is going to be so weird in pycurl
    - disable signals and make sure we don't handle/intercept any in the pycurl code.
    - set 'check_timestamp' regets as NotImplemented. The workaround is multiple connections;
      it is possible but not immediately useful since, as far as I can tell, NOTHING uses check_timestamp regets.
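
Not part of the patch, but a rough sketch of the callback pattern the first two bullets are about (the URL and output path below are made up): pycurl callbacks run inside libcurl, so a KeyboardInterrupt raised mid-callback cannot propagate cleanly. The callback has to catch it and return READFUNC_ABORT, libcurl then reports error 23 ("write callback failed") from perform(), and the caller turns that back into a KeyboardInterrupt.

import pycurl

def _write_cb(buf):
    # pycurl callbacks must not let KeyboardInterrupt escape; catch it and
    # tell libcurl to abort the transfer instead
    try:
        outfile.write(buf)
        return len(buf)
    except KeyboardInterrupt:
        return pycurl.READFUNC_ABORT

outfile = open('/tmp/somefile', 'wb')           # made-up destination
c = pycurl.Curl()
c.setopt(pycurl.URL, 'http://example.com/')     # made-up URL
c.setopt(pycurl.NOSIGNAL, True)                 # keep libcurl from installing signal handlers
c.setopt(pycurl.WRITEFUNCTION, _write_cb)
try:
    c.perform()
except pycurl.error, e:
    if e.args[0] == 23:
        # write callback aborted - in practice that almost always means ctrl-c
        raise KeyboardInterrupt
    raise
c.close()
outfile.close()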

diff --git a/urlgrabber/grabber.py b/urlgrabber/grabber.py
index 3758799..e032135 100644
--- a/urlgrabber/grabber.py
+++ b/urlgrabber/grabber.py
@@ -468,6 +468,13 @@ except AttributeError:
     TimeoutError = None
     have_socket_timeout = False
 
+try:
+    import signal
+    from signal import SIGPIPE, SIG_IGN
+    signal.signal(signal.SIGPIPE, signal.SIG_IGN)
+except ImportError:
+    pass
+
 ########################################################################
 # functions for debugging output.  These functions are here because they
 # are also part of the module initialization.
@@ -1453,8 +1460,9 @@ class PyCurlFileObject():
         self.append = False
         self.reget_time = None
         self.opts = opts
+        if self.opts.reget == 'check_timestamp':
+            raise NotImplementedError, "check_timestamp regets are not implemented in this ver of urlgrabber. Please report this."
         self._complete = False
-        self.reget_time = None
         self._rbuf = ''
         self._rbufsize = 1024*8
         self._ttime = time.time()
@@ -1476,39 +1484,45 @@ class PyCurlFileObject():
         raise AttributeError, name
 
     def _retrieve(self, buf):
-        if not self._prog_running:
-            if self.opts.progress_obj:
-                size  = self.size + self._reget_length
-                self.opts.progress_obj.start(self._prog_reportname, 
-                                             urllib.unquote(self.url), 
-                                             self._prog_basename, 
-                                             size=size,
-                                             text=self.opts.text)
-                self._prog_running = True
-                self.opts.progress_obj.update(self._amount_read)
-
-        self._amount_read += len(buf)
-        self.fo.write(buf)
-        return len(buf)
-    
+        try:
+            if not self._prog_running:
+                if self.opts.progress_obj:
+                    size  = self.size + self._reget_length
+                    self.opts.progress_obj.start(self._prog_reportname, 
+                                                 urllib.unquote(self.url), 
+                                                 self._prog_basename, 
+                                                 size=size,
+                                                 text=self.opts.text)
+                    self._prog_running = True
+                    self.opts.progress_obj.update(self._amount_read)
+
+            self._amount_read += len(buf)
+            self.fo.write(buf)
+            return len(buf)
+        except KeyboardInterrupt:
+            return pycurl.READFUNC_ABORT
+            
     def _hdr_retrieve(self, buf):
-        self._hdr_dump += buf
-        # we have to get the size before we do the progress obj start
-        # but we can't do that w/o making it do 2 connects, which sucks
-        # so we cheat and stuff it in here in the hdr_retrieve
-        if self.scheme in ['http','https'] and buf.lower().find('content-length') != -1:
-            length = buf.split(':')[1]
-            self.size = int(length)
-        elif self.scheme in ['ftp']:
-            s = None
-            if buf.startswith('213 '):
-                s = buf[3:].strip()
-            elif buf.startswith('150 '):
-                s = parse150(buf)
-            if s:
-                self.size = s
-        
-        return len(buf)
+        try:
+            self._hdr_dump += buf
+            # we have to get the size before we do the progress obj start
+            # but we can't do that w/o making it do 2 connects, which sucks
+            # so we cheat and stuff it in here in the hdr_retrieve
+            if self.scheme in ['http','https'] and buf.lower().find('content-length') != -1:
+                length = buf.split(':')[1]
+                self.size = int(length)
+            elif self.scheme in ['ftp']:
+                s = None
+                if buf.startswith('213 '):
+                    s = buf[3:].strip()
+                elif buf.startswith('150 '):
+                    s = parse150(buf)
+                if s:
+                    self.size = s
+            
+            return len(buf)
+        except KeyboardInterrupt:
+            return pycurl.READFUNC_ABORT
 
     def _return_hdr_obj(self):
         if self._parsed_hdr:
@@ -1531,6 +1545,7 @@ class PyCurlFileObject():
 
         # defaults we're always going to set
         self.curl_obj.setopt(pycurl.NOPROGRESS, False)
+        self.curl_obj.setopt(pycurl.NOSIGNAL, True)
         self.curl_obj.setopt(pycurl.WRITEFUNCTION, self._retrieve)
         self.curl_obj.setopt(pycurl.HEADERFUNCTION, self._hdr_retrieve)
         self.curl_obj.setopt(pycurl.PROGRESSFUNCTION, self._progress_update)
@@ -1610,11 +1625,23 @@ class PyCurlFileObject():
             # to other URLGrabErrors from 
             # http://curl.haxx.se/libcurl/c/libcurl-errors.html
             # this covers e.args[0] == 22 pretty well - which will be common
+            code = self.http_code                                
             if e.args[0] == 28:
                 err = URLGrabError(12, _('Timeout on %s: %s') % (self.url, e))
                 err.url = self.url
                 raise err
-            code = self.http_code                
+
+            elif e.args[0] == 23 and code >= 200 and code < 299:
+                err = URLGrabError(15, _('User (or something) called abort %s: %s') % (self.url, e))
+                err.url = self.url
+                # this is probably wrong but ultimately this is what happens
+                # we have a legit http code and a pycurl 'writer failed' code
+                # which almost always means something aborted it from outside
+                # since we cannot know what it is -I'm banking on it being
+                # a ctrl-c. XXXX - if there's a way of going back two raises to 
+                # figure out what aborted the pycurl process FIXME
+                raise KeyboardInterrupt
+
             if str(e.args[1]) == '': # fake it until you make it
                 msg = 'HTTP Error %s : %s ' % (self.http_code, self.url)
             else:
@@ -1623,7 +1650,7 @@ class PyCurlFileObject():
             err.code = code
             err.exception = e
             raise err
-            
+
     def _do_open(self):
         self.curl_obj = _curl_cache
         self.curl_obj.reset() # reset all old settings away, just in case
@@ -1842,6 +1869,7 @@ class PyCurlFileObject():
                 downloaded += self._reget_length
                 self.opts.progress_obj.update(downloaded)
 
+
     def read(self, amt=None):
         self._fill_buffer(amt)
         if amt is None:
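
For anyone hitting this from the caller side, a hedged sketch of what the reget change means ('simple' regets are untouched, only 'check_timestamp' is refused; URL and filename are made up):

from urlgrabber.grabber import URLGrabber

g = URLGrabber(reget='simple')          # plain regets still work as before
g.urlgrab('http://example.com/big.iso', '/tmp/big.iso')

g = URLGrabber(reget='check_timestamp')
try:
    g.urlgrab('http://example.com/big.iso', '/tmp/big.iso')
except NotImplementedError:
    # this version of urlgrabber refuses timestamp-checked regets up front
    pass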

