l4/pkg/python/contrib/Lib/urllib.py

   1 """Open an arbitrary URL.
   2
   3 See the following document for more info on URLs:
   4 "Names and Addresses, URIs, URLs, URNs, URCs", at
   5 http://www.w3.org/pub/WWW/Addressing/Overview.html
   6
   7 See also the HTTP spec (from which the error codes are derived):
   8 "HTTP - Hypertext Transfer Protocol", at
   9 http://www.w3.org/pub/WWW/Protocols/
  10
  11 Related standards and specs:
  12 - RFC1808: the "relative URL" spec. (authoritative status)
  13 - RFC1738 - the "URL standard". (authoritative status)
  14 - RFC1630 - the "URI spec". (informational status)
  15
  16 The object returned by URLopener().open(file) will differ per
  17 protocol.  All you know is that is has methods read(), readline(),
  18 readlines(), fileno(), close() and info().  The read*(), fileno()
  19 and close() methods work like those of open files.
  20 The info() method returns a mimetools.Message object which can be
  21 used to query various info about the object, if available.
  22 (mimetools.Message objects are queried with the getheader() method.)
  23 """
  24
  25 import string
  26 import socket
  27 import os
  28 import time
  29 import sys
  30 from urlparse import urljoin as basejoin
  31 import warnings
  32
  33 __all__ = ["urlopen", "URLopener", "FancyURLopener", "urlretrieve",
  34            "urlcleanup", "quote", "quote_plus", "unquote", "unquote_plus",
  35            "urlencode", "url2pathname", "pathname2url", "splittag",
  36            "localhost", "thishost", "ftperrors", "basejoin", "unwrap",
  37            "splittype", "splithost", "splituser", "splitpasswd", "splitport",
  38            "splitnport", "splitquery", "splitattr", "splitvalue",
  39            "getproxies"]
  40
  41 __version__ = '1.17'    # XXX This version is not always updated :-(
  42
  43 MAXFTPCACHE = 10        # Trim the ftp cache beyond this size
  44
  45 # Helper for non-unix systems
  46 if os.name == 'mac':
  47     from macurl2path import url2pathname, pathname2url
  48 elif os.name == 'nt':
  49     from nturl2path import url2pathname, pathname2url
  50 elif os.name == 'riscos':
  51     from rourl2path import url2pathname, pathname2url
  52 else:
  53     def url2pathname(pathname):
  54         """OS-specific conversion from a relative URL of the 'file' scheme
  55         to a file system path; not recommended for general use."""
  56         return unquote(pathname)
  57
  58     def pathname2url(pathname):
  59         """OS-specific conversion from a file system path to a relative URL
  60         of the 'file' scheme; not recommended for general use."""
  61         return quote(pathname)
  62
  63 # This really consists of two pieces:
  64 # (1) a class which handles opening of all sorts of URLs
  65 #     (plus assorted utilities etc.)
  66 # (2) a set of functions for parsing URLs
  67 # XXX Should these be separated out into different modules?
  68
  69
  70 # Shortcut for basic usage
  71 _urlopener = None
  72 def urlopen(url, data=None, proxies=None):
  73     """Create a file-like object for the specified URL to read from."""
  74     from warnings import warnpy3k
  75     warnings.warnpy3k("urllib.urlopen() has been removed in Python 3.0 in "
  76                         "favor of urllib2.urlopen()", stacklevel=2)
  77
  78     global _urlopener
  79     if proxies is not None:
  80         opener = FancyURLopener(proxies=proxies)
  81     elif not _urlopener:
  82         opener = FancyURLopener()
  83         _urlopener = opener
  84     else:
  85         opener = _urlopener
  86     if data is None:
  87         return opener.open(url)
  88     else:
  89         return opener.open(url, data)
  90 def urlretrieve(url, filename=None, reporthook=None, data=None):
  91     global _urlopener
  92     if not _urlopener:
  93         _urlopener = FancyURLopener()
  94     return _urlopener.retrieve(url, filename, reporthook, data)
  95 def urlcleanup():
  96     if _urlopener:
  97         _urlopener.cleanup()
  98
  99 # check for SSL
 100 try:
 101     import ssl
 102 except:
 103     _have_ssl = False
 104 else:
 105     _have_ssl = True
 106
 107 # exception raised when downloaded size does not match content-length
 108 class ContentTooShortError(IOError):
 109     def __init__(self, message, content):
 110         IOError.__init__(self, message)
 111         self.content = content
 112
 113 ftpcache = {}
 114 class URLopener:
 115     """Class to open URLs.
 116     This is a class rather than just a subroutine because we may need
 117     more than one set of global protocol-specific options.
 118     Note -- this is a base class for those who don't want the
 119     automatic handling of errors type 302 (relocated) and 401
 120     (authorization needed)."""
 121
 122     __tempfiles = None
 123
 124     version = "Python-urllib/%s" % __version__
 125
 126     # Constructor
 127     def __init__(self, proxies=None, **x509):
 128         if proxies is None:
 129             proxies = getproxies()
 130         assert hasattr(proxies, 'has_key'), "proxies must be a mapping"
 131         self.proxies = proxies
 132         self.key_file = x509.get('key_file')
 133         self.cert_file = x509.get('cert_file')
 134         self.addheaders = [('User-Agent', self.version)]
 135         self.__tempfiles = []
 136         self.__unlink = os.unlink # See cleanup()
 137         self.tempcache = None
 138         # Undocumented feature: if you assign {} to tempcache,
 139         # it is used to cache files retrieved with
 140         # self.retrieve().  This is not enabled by default
 141         # since it does not work for changing documents (and I
 142         # haven't got the logic to check expiration headers
 143         # yet).
 144         self.ftpcache = ftpcache
 145         # Undocumented feature: you can use a different
 146         # ftp cache by assigning to the .ftpcache member;
 147         # in case you want logically independent URL openers
 148         # XXX This is not threadsafe.  Bah.
 149
 150     def __del__(self):
 151         self.close()
 152
 153     def close(self):
 154         self.cleanup()
 155
 156     def cleanup(self):
 157         # This code sometimes runs when the rest of this module
 158         # has already been deleted, so it can't use any globals
 159         # or import anything.
 160         if self.__tempfiles:
 161             for file in self.__tempfiles:
 162                 try:
 163                     self.__unlink(file)
 164                 except OSError:
 165                     pass
 166             del self.__tempfiles[:]
 167         if self.tempcache:
 168             self.tempcache.clear()
 169
 170     def addheader(self, *args):
 171         """Add a header to be used by the HTTP interface only
 172         e.g. u.addheader('Accept', 'sound/basic')"""
 173         self.addheaders.append(args)
 174
 175     # External interface
 176     def open(self, fullurl, data=None):
 177         """Use URLopener().open(file) instead of open(file, 'r')."""
 178         fullurl = unwrap(toBytes(fullurl))
 179         # percent encode url. fixing lame server errors like space within url
 180         # parts
 181         fullurl = quote(fullurl, safe="%/:=&?~#+!$,;'@()*[]")
 182         if self.tempcache and fullurl in self.tempcache:
 183             filename, headers = self.tempcache[fullurl]
 184             fp = open(filename, 'rb')
 185             return addinfourl(fp, headers, fullurl)
 186         urltype, url = splittype(fullurl)
 187         if not urltype:
 188             urltype = 'file'
 189         if urltype in self.proxies:
 190             proxy = self.proxies[urltype]
 191             urltype, proxyhost = splittype(proxy)
 192             host, selector = splithost(proxyhost)
 193             url = (host, fullurl) # Signal special case to open_*()
 194         else:
 195             proxy = None
 196         name = 'open_' + urltype
 197         self.type = urltype
 198         name = name.replace('-', '_')
 199         if not hasattr(self, name):
 200             if proxy:
 201                 return self.open_unknown_proxy(proxy, fullurl, data)
 202             else:
 203                 return self.open_unknown(fullurl, data)
 204         try:
 205             if data is None:
 206                 return getattr(self, name)(url)
 207             else:
 208                 return getattr(self, name)(url, data)
 209         except socket.error, msg:
 210             raise IOError, ('socket error', msg), sys.exc_info()[2]
 211
 212     def open_unknown(self, fullurl, data=None):
 213         """Overridable interface to open unknown URL type."""
 214         type, url = splittype(fullurl)
 215         raise IOError, ('url error', 'unknown url type', type)
 216
 217     def open_unknown_proxy(self, proxy, fullurl, data=None):
 218         """Overridable interface to open unknown URL type."""
 219         type, url = splittype(fullurl)
 220         raise IOError, ('url error', 'invalid proxy for %s' % type, proxy)
 221
 222     # External interface
 223     def retrieve(self, url, filename=None, reporthook=None, data=None):
 224         """retrieve(url) returns (filename, headers) for a local object
 225         or (tempfilename, headers) for a remote object."""
 226         url = unwrap(toBytes(url))
 227         if self.tempcache and url in self.tempcache:
 228             return self.tempcache[url]
 229         type, url1 = splittype(url)
 230         if filename is None and (not type or type == 'file'):
 231             try:
 232                 fp = self.open_local_file(url1)
 233                 hdrs = fp.info()
 234                 del fp
 235                 return url2pathname(splithost(url1)[1]), hdrs
 236             except IOError, msg:
 237                 pass
 238         fp = self.open(url, data)
 239         try:
 240             headers = fp.info()
 241             if filename:
 242                 tfp = open(filename, 'wb')
 243             else:
 244                 import tempfile
 245                 garbage, path = splittype(url)
 246                 garbage, path = splithost(path or "")
 247                 path, garbage = splitquery(path or "")
 248                 path, garbage = splitattr(path or "")
 249                 suffix = os.path.splitext(path)[1]
 250                 (fd, filename) = tempfile.mkstemp(suffix)
 251                 self.__tempfiles.append(filename)
 252                 tfp = os.fdopen(fd, 'wb')
 253             try:
 254                 result = filename, headers
 255                 if self.tempcache is not None:
 256                     self.tempcache[url] = result
 257                 bs = 1024*8
 258                 size = -1
 259                 read = 0
 260                 blocknum = 0
 261                 if reporthook:
 262                     if "content-length" in headers:
 263                         size = int(headers["Content-Length"])
 264                     reporthook(blocknum, bs, size)
 265                 while 1:
 266                     block = fp.read(bs)
 267                     if block == "":
 268                         break
 269                     read += len(block)
 270                     tfp.write(block)
 271                     blocknum += 1
 272                     if reporthook:
 273                         reporthook(blocknum, bs, size)
 274             finally:
 275                 tfp.close()
 276         finally:
 277             fp.close()
 278         del fp
 279         del tfp
 280
 281         # raise exception if actual size does not match content-length header
 282         if size >= 0 and read < size:
 283             raise ContentTooShortError("retrieval incomplete: got only %i out "
 284                                        "of %i bytes" % (read, size), result)
 285
 286         return result
 287
 288     # Each method named open_<type> knows how to open that type of URL
 289
 290     def open_http(self, url, data=None):
 291         """Use HTTP protocol."""
 292         import httplib
 293         user_passwd = None
 294         proxy_passwd= None
 295         if isinstance(url, str):
 296             host, selector = splithost(url)
 297             if host:
 298                 user_passwd, host = splituser(host)
 299                 host = unquote(host)
 300             realhost = host
 301         else:
 302             host, selector = url
 303             # check whether the proxy contains authorization information
 304             proxy_passwd, host = splituser(host)
 305             # now we proceed with the url we want to obtain
 306             urltype, rest = splittype(selector)
 307             url = rest
 308             user_passwd = None
 309             if urltype.lower() != 'http':
 310                 realhost = None
 311             else:
 312                 realhost, rest = splithost(rest)
 313                 if realhost:
 314                     user_passwd, realhost = splituser(realhost)
 315                 if user_passwd:
 316                     selector = "%s://%s%s" % (urltype, realhost, rest)
 317                 if proxy_bypass(realhost):
 318                     host = realhost
 319
 320             #print "proxy via http:", host, selector
 321         if not host: raise IOError, ('http error', 'no host given')
 322
 323         if proxy_passwd:
 324             import base64
 325             proxy_auth = base64.b64encode(proxy_passwd).strip()
 326         else:
 327             proxy_auth = None
 328
 329         if user_passwd:
 330             import base64
 331             auth = base64.b64encode(user_passwd).strip()
 332         else:
 333             auth = None
 334         h = httplib.HTTP(host)
 335         if data is not None:
 336             h.putrequest('POST', selector)
 337             h.putheader('Content-Type', 'application/x-www-form-urlencoded')
 338             h.putheader('Content-Length', '%d' % len(data))
 339         else:
 340             h.putrequest('GET', selector)
 341         if proxy_auth: h.putheader('Proxy-Authorization', 'Basic %s' % proxy_auth)
 342         if auth: h.putheader('Authorization', 'Basic %s' % auth)
 343         if realhost: h.putheader('Host', realhost)
 344         for args in self.addheaders: h.putheader(*args)
 345         h.endheaders()
 346         if data is not None:
 347             h.send(data)
 348         errcode, errmsg, headers = h.getreply()
 349         fp = h.getfile()
 350         if errcode == -1:
 351             if fp: fp.close()
 352             # something went wrong with the HTTP status line
 353             raise IOError, ('http protocol error', 0,
 354                             'got a bad status line', None)
 355         # According to RFC 2616, "2xx" code indicates that the client's
 356         # request was successfully received, understood, and accepted.
 357         if (200 <= errcode < 300):
 358             return addinfourl(fp, headers, "http:" + url, errcode)
 359         else:
 360             if data is None:
 361                 return self.http_error(url, fp, errcode, errmsg, headers)
 362             else:
 363                 return self.http_error(url, fp, errcode, errmsg, headers, data)
 364
 365     def http_error(self, url, fp, errcode, errmsg, headers, data=None):
 366         """Handle http errors.
 367         Derived class can override this, or provide specific handlers
 368         named http_error_DDD where DDD is the 3-digit error code."""
 369         # First check if there's a specific handler for this error
 370         name = 'http_error_%d' % errcode
 371         if hasattr(self, name):
 372             method = getattr(self, name)
 373             if data is None:
 374                 result = method(url, fp, errcode, errmsg, headers)
 375             else:
 376                 result = method(url, fp, errcode, errmsg, headers, data)
 377             if result: return result
 378         return self.http_error_default(url, fp, errcode, errmsg, headers)
 379
 380     def http_error_default(self, url, fp, errcode, errmsg, headers):
 381         """Default error handler: close the connection and raise IOError."""
 382         void = fp.read()
 383         fp.close()
 384         raise IOError, ('http error', errcode, errmsg, headers)
 385
 386     if _have_ssl:
 387         def open_https(self, url, data=None):
 388             """Use HTTPS protocol."""
 389
 390             import httplib
 391             user_passwd = None
 392             proxy_passwd = None
 393             if isinstance(url, str):
 394                 host, selector = splithost(url)
 395                 if host:
 396                     user_passwd, host = splituser(host)
 397                     host = unquote(host)
 398                 realhost = host
 399             else:
 400                 host, selector = url
 401                 # here, we determine, whether the proxy contains authorization information
 402                 proxy_passwd, host = splituser(host)
 403                 urltype, rest = splittype(selector)
 404                 url = rest
 405                 user_passwd = None
 406                 if urltype.lower() != 'https':
 407                     realhost = None
 408                 else:
 409                     realhost, rest = splithost(rest)
 410                     if realhost:
 411                         user_passwd, realhost = splituser(realhost)
 412                     if user_passwd:
 413                         selector = "%s://%s%s" % (urltype, realhost, rest)
 414                 #print "proxy via https:", host, selector
 415             if not host: raise IOError, ('https error', 'no host given')
 416             if proxy_passwd:
 417                 import base64
 418                 proxy_auth = base64.b64encode(proxy_passwd).strip()
 419             else:
 420                 proxy_auth = None
 421             if user_passwd:
 422                 import base64
 423                 auth = base64.b64encode(user_passwd).strip()
 424             else:
 425                 auth = None
 426             h = httplib.HTTPS(host, 0,
 427                               key_file=self.key_file,
 428                               cert_file=self.cert_file)
 429             if data is not None:
 430                 h.putrequest('POST', selector)
 431                 h.putheader('Content-Type',
 432                             'application/x-www-form-urlencoded')
 433                 h.putheader('Content-Length', '%d' % len(data))
 434             else:
 435                 h.putrequest('GET', selector)
 436             if proxy_auth: h.putheader('Proxy-Authorization', 'Basic %s' % proxy_auth)
 437             if auth: h.putheader('Authorization', 'Basic %s' % auth)
 438             if realhost: h.putheader('Host', realhost)
 439             for args in self.addheaders: h.putheader(*args)
 440             h.endheaders()
 441             if data is not None:
 442                 h.send(data)
 443             errcode, errmsg, headers = h.getreply()
 444             fp = h.getfile()
 445             if errcode == -1:
 446                 if fp: fp.close()
 447                 # something went wrong with the HTTP status line
 448                 raise IOError, ('http protocol error', 0,
 449                                 'got a bad status line', None)
 450             # According to RFC 2616, "2xx" code indicates that the client's
 451             # request was successfully received, understood, and accepted.
 452             if (200 <= errcode < 300):
 453                 return addinfourl(fp, headers, "https:" + url, errcode)
 454             else:
 455                 if data is None:
 456                     return self.http_error(url, fp, errcode, errmsg, headers)
 457                 else:
 458                     return self.http_error(url, fp, errcode, errmsg, headers,
 459                                            data)
 460
 461     def open_file(self, url):
 462         """Use local file or FTP depending on form of URL."""
 463         if not isinstance(url, str):
 464             raise IOError, ('file error', 'proxy support for file protocol currently not implemented')
 465         if url[:2] == '//' and url[2:3] != '/' and url[2:12].lower() != 'localhost/':
 466             return self.open_ftp(url)
 467         else:
 468             return self.open_local_file(url)
 469
 470     def open_local_file(self, url):
 471         """Use local file."""
 472         import mimetypes, mimetools, email.utils
 473         try:
 474             from cStringIO import StringIO
 475         except ImportError:
 476             from StringIO import StringIO
 477         host, file = splithost(url)
 478         localname = url2pathname(file)
 479         try:
 480             stats = os.stat(localname)
 481         except OSError, e:
 482             raise IOError(e.errno, e.strerror, e.filename)
 483         size = stats.st_size
 484         modified = email.utils.formatdate(stats.st_mtime, usegmt=True)
 485         mtype = mimetypes.guess_type(url)[0]
 486         headers = mimetools.Message(StringIO(
 487             'Content-Type: %s\nContent-Length: %d\nLast-modified: %s\n' %
 488             (mtype or 'text/plain', size, modified)))
 489         if not host:
 490             urlfile = file
 491             if file[:1] == '/':
 492                 urlfile = 'file://' + file
 493             return addinfourl(open(localname, 'rb'),
 494                               headers, urlfile)
 495         host, port = splitport(host)
 496         if not port \
 497            and socket.gethostbyname(host) in (localhost(), thishost()):
 498             urlfile = file
 499             if file[:1] == '/':
 500                 urlfile = 'file://' + file
 501             return addinfourl(open(localname, 'rb'),
 502                               headers, urlfile)
 503         raise IOError, ('local file error', 'not on local host')
 504
 505     def open_ftp(self, url):
 506         """Use FTP protocol."""
 507         if not isinstance(url, str):
 508             raise IOError, ('ftp error', 'proxy support for ftp protocol currently not implemented')
 509         import mimetypes, mimetools
 510         try:
 511             from cStringIO import StringIO
 512         except ImportError:
 513             from StringIO import StringIO
 514         host, path = splithost(url)
 515         if not host: raise IOError, ('ftp error', 'no host given')
 516         host, port = splitport(host)
 517         user, host = splituser(host)
 518         if user: user, passwd = splitpasswd(user)
 519         else: passwd = None
 520         host = unquote(host)
 521         user = unquote(user or '')
 522         passwd = unquote(passwd or '')
 523         host = socket.gethostbyname(host)
 524         if not port:
 525             import ftplib
 526             port = ftplib.FTP_PORT
 527         else:
 528             port = int(port)
 529         path, attrs = splitattr(path)
 530         path = unquote(path)
 531         dirs = path.split('/')
 532         dirs, file = dirs[:-1], dirs[-1]
 533         if dirs and not dirs[0]: dirs = dirs[1:]
 534         if dirs and not dirs[0]: dirs[0] = '/'
 535         key = user, host, port, '/'.join(dirs)
 536         # XXX thread unsafe!
 537         if len(self.ftpcache) > MAXFTPCACHE:
 538             # Prune the cache, rather arbitrarily
 539             for k in self.ftpcache.keys():
 540                 if k != key:
 541                     v = self.ftpcache[k]
 542                     del self.ftpcache[k]
 543                     v.close()
 544         try:
 545             if not key in self.ftpcache:
 546                 self.ftpcache[key] = \
 547                     ftpwrapper(user, passwd, host, port, dirs)
 548             if not file: type = 'D'
 549             else: type = 'I'
 550             for attr in attrs:
 551                 attr, value = splitvalue(attr)
 552                 if attr.lower() == 'type' and \
 553                    value in ('a', 'A', 'i', 'I', 'd', 'D'):
 554                     type = value.upper()
 555             (fp, retrlen) = self.ftpcache[key].retrfile(file, type)
 556             mtype = mimetypes.guess_type("ftp:" + url)[0]
 557             headers = ""
 558             if mtype:
 559                 headers += "Content-Type: %s\n" % mtype
 560             if retrlen is not None and retrlen >= 0:
 561                 headers += "Content-Length: %d\n" % retrlen
 562             headers = mimetools.Message(StringIO(headers))
 563             return addinfourl(fp, headers, "ftp:" + url)
 564         except ftperrors(), msg:
 565             raise IOError, ('ftp error', msg), sys.exc_info()[2]
 566
 567     def open_data(self, url, data=None):
 568         """Use "data" URL."""
 569         if not isinstance(url, str):
 570             raise IOError, ('data error', 'proxy support for data protocol currently not implemented')
 571         # ignore POSTed data
 572         #
 573         # syntax of data URLs:
 574         # dataurl   := "data:" [ mediatype ] [ ";base64" ] "," data
 575         # mediatype := [ type "/" subtype ] *( ";" parameter )
 576         # data      := *urlchar
 577         # parameter := attribute "=" value
 578         import mimetools
 579         try:
 580             from cStringIO import StringIO
 581         except ImportError:
 582             from StringIO import StringIO
 583         try:
 584             [type, data] = url.split(',', 1)
 585         except ValueError:
 586             raise IOError, ('data error', 'bad data URL')
 587         if not type:
 588             type = 'text/plain;charset=US-ASCII'
 589         semi = type.rfind(';')
 590         if semi >= 0 and '=' not in type[semi:]:
 591             encoding = type[semi+1:]
 592             type = type[:semi]
 593         else:
 594             encoding = ''
 595         msg = []
 596         msg.append('Date: %s'%time.strftime('%a, %d %b %Y %T GMT',
 597                                             time.gmtime(time.time())))
 598         msg.append('Content-type: %s' % type)
 599         if encoding == 'base64':
 600             import base64
 601             data = base64.decodestring(data)
 602         else:
 603             data = unquote(data)
 604         msg.append('Content-Length: %d' % len(data))
 605         msg.append('')
 606         msg.append(data)
 607         msg = '\n'.join(msg)
 608         f = StringIO(msg)
 609         headers = mimetools.Message(f, 0)
 610         #f.fileno = None     # needed for addinfourl
 611         return addinfourl(f, headers, url)
 612
 613
 614 class FancyURLopener(URLopener):
 615     """Derived class with handlers for errors we can handle (perhaps)."""
 616
 617     def __init__(self, *args, **kwargs):
 618         URLopener.__init__(self, *args, **kwargs)
 619         self.auth_cache = {}
 620         self.tries = 0
 621         self.maxtries = 10
 622
 623     def http_error_default(self, url, fp, errcode, errmsg, headers):
 624         """Default error handling -- don't raise an exception."""
 625         return addinfourl(fp, headers, "http:" + url, errcode)
 626
 627     def http_error_302(self, url, fp, errcode, errmsg, headers, data=None):
 628         """Error 302 -- relocated (temporarily)."""
 629         self.tries += 1
 630         if self.maxtries and self.tries >= self.maxtries:
 631             if hasattr(self, "http_error_500"):
 632                 meth = self.http_error_500
 633             else:
 634                 meth = self.http_error_default
 635             self.tries = 0
 636             return meth(url, fp, 500,
 637                         "Internal Server Error: Redirect Recursion", headers)
 638         result = self.redirect_internal(url, fp, errcode, errmsg, headers,
 639                                         data)
 640         self.tries = 0
 641         return result
 642
 643     def redirect_internal(self, url, fp, errcode, errmsg, headers, data):
 644         if 'location' in headers:
 645             newurl = headers['location']
 646         elif 'uri' in headers:
 647             newurl = headers['uri']
 648         else:
 649             return
 650         void = fp.read()
 651         fp.close()
 652         # In case the server sent a relative URL, join with original:
 653         newurl = basejoin(self.type + ":" + url, newurl)
 654         return self.open(newurl)
 655
 656     def http_error_301(self, url, fp, errcode, errmsg, headers, data=None):
 657         """Error 301 -- also relocated (permanently)."""
 658         return self.http_error_302(url, fp, errcode, errmsg, headers, data)
 659
 660     def http_error_303(self, url, fp, errcode, errmsg, headers, data=None):
 661         """Error 303 -- also relocated (essentially identical to 302)."""
 662         return self.http_error_302(url, fp, errcode, errmsg, headers, data)
 663
 664     def http_error_307(self, url, fp, errcode, errmsg, headers, data=None):
 665         """Error 307 -- relocated, but turn POST into error."""
 666         if data is None:
 667             return self.http_error_302(url, fp, errcode, errmsg, headers, data)
 668         else:
 669             return self.http_error_default(url, fp, errcode, errmsg, headers)
 670
 671     def http_error_401(self, url, fp, errcode, errmsg, headers, data=None):
 672         """Error 401 -- authentication required.
 673         This function supports Basic authentication only."""
 674         if not 'www-authenticate' in headers:
 675             URLopener.http_error_default(self, url, fp,
 676                                          errcode, errmsg, headers)
 677         stuff = headers['www-authenticate']
 678         import re
 679         match = re.match('[ \t]*([^ \t]+)[ \t]+realm="([^"]*)"', stuff)
 680         if not match:
 681             URLopener.http_error_default(self, url, fp,
 682                                          errcode, errmsg, headers)
 683         scheme, realm = match.groups()
 684         if scheme.lower() != 'basic':
 685             URLopener.http_error_default(self, url, fp,
 686                                          errcode, errmsg, headers)
 687         name = 'retry_' + self.type + '_basic_auth'
 688         if data is None:
 689             return getattr(self,name)(url, realm)
 690         else:
 691             return getattr(self,name)(url, realm, data)
 692
 693     def http_error_407(self, url, fp, errcode, errmsg, headers, data=None):
 694         """Error 407 -- proxy authentication required.
 695         This function supports Basic authentication only."""
 696         if not 'proxy-authenticate' in headers:
 697             URLopener.http_error_default(self, url, fp,
 698                                          errcode, errmsg, headers)
 699         stuff = headers['proxy-authenticate']
 700         import re
 701         match = re.match('[ \t]*([^ \t]+)[ \t]+realm="([^"]*)"', stuff)
 702         if not match:
 703             URLopener.http_error_default(self, url, fp,
 704                                          errcode, errmsg, headers)
 705         scheme, realm = match.groups()
 706         if scheme.lower() != 'basic':
 707             URLopener.http_error_default(self, url, fp,
 708                                          errcode, errmsg, headers)
 709         name = 'retry_proxy_' + self.type + '_basic_auth'
 710         if data is None:
 711             return getattr(self,name)(url, realm)
 712         else:
 713             return getattr(self,name)(url, realm, data)
 714
 715     def retry_proxy_http_basic_auth(self, url, realm, data=None):
 716         host, selector = splithost(url)
 717         newurl = 'http://' + host + selector
 718         proxy = self.proxies['http']
 719         urltype, proxyhost = splittype(proxy)
 720         proxyhost, proxyselector = splithost(proxyhost)
 721         i = proxyhost.find('@') + 1
 722         proxyhost = proxyhost[i:]
 723         user, passwd = self.get_user_passwd(proxyhost, realm, i)
 724         if not (user or passwd): return None
 725         proxyhost = quote(user, safe='') + ':' + quote(passwd, safe='') + '@' + proxyhost
 726         self.proxies['http'] = 'http://' + proxyhost + proxyselector
 727         if data is None:
 728             return self.open(newurl)
 729         else:
 730             return self.open(newurl, data)
 731
 732     def retry_proxy_https_basic_auth(self, url, realm, data=None):
 733         host, selector = splithost(url)
 734         newurl = 'https://' + host + selector
 735         proxy = self.proxies['https']
 736         urltype, proxyhost = splittype(proxy)
 737         proxyhost, proxyselector = splithost(proxyhost)
 738         i = proxyhost.find('@') + 1
 739         proxyhost = proxyhost[i:]
 740         user, passwd = self.get_user_passwd(proxyhost, realm, i)
 741         if not (user or passwd): return None
 742         proxyhost = quote(user, safe='') + ':' + quote(passwd, safe='') + '@' + proxyhost
 743         self.proxies['https'] = 'https://' + proxyhost + proxyselector
 744         if data is None:
 745             return self.open(newurl)
 746         else:
 747             return self.open(newurl, data)
 748
 749     def retry_http_basic_auth(self, url, realm, data=None):
 750         host, selector = splithost(url)
 751         i = host.find('@') + 1
 752         host = host[i:]
 753         user, passwd = self.get_user_passwd(host, realm, i)
 754         if not (user or passwd): return None
 755         host = quote(user, safe='') + ':' + quote(passwd, safe='') + '@' + host
 756         newurl = 'http://' + host + selector
 757         if data is None:
 758             return self.open(newurl)
 759         else:
 760             return self.open(newurl, data)
 761
 762     def retry_https_basic_auth(self, url, realm, data=None):
 763         host, selector = splithost(url)
 764         i = host.find('@') + 1
 765         host = host[i:]
 766         user, passwd = self.get_user_passwd(host, realm, i)
 767         if not (user or passwd): return None
 768         host = quote(user, safe='') + ':' + quote(passwd, safe='') + '@' + host
 769         newurl = 'https://' + host + selector
 770         if data is None:
 771             return self.open(newurl)
 772         else:
 773             return self.open(newurl, data)
 774
 775     def get_user_passwd(self, host, realm, clear_cache = 0):
 776         key = realm + '@' + host.lower()
 777         if key in self.auth_cache:
 778             if clear_cache:
 779                 del self.auth_cache[key]
 780             else:
 781                 return self.auth_cache[key]
 782         user, passwd = self.prompt_user_passwd(host, realm)
 783         if user or passwd: self.auth_cache[key] = (user, passwd)
 784         return user, passwd
 785
 786     def prompt_user_passwd(self, host, realm):
 787         """Override this in a GUI environment!"""
 788         import getpass
 789         try:
 790             user = raw_input("Enter username for %s at %s: " % (realm,
 791                                                                 host))
 792             passwd = getpass.getpass("Enter password for %s in %s at %s: " %
 793                 (user, realm, host))
 794             return user, passwd
 795         except KeyboardInterrupt:
 796             print
 797             return None, None
 798
 799
 800 # Utility functions
 801
 802 _localhost = None
 803 def localhost():
 804     """Return the IP address of the magic hostname 'localhost'."""
 805     global _localhost
 806     if _localhost is None:
 807         _localhost = socket.gethostbyname('localhost')
 808     return _localhost
 809
 810 _thishost = None
 811 def thishost():
 812     """Return the IP address of the current host."""
 813     global _thishost
 814     if _thishost is None:
 815         _thishost = socket.gethostbyname(socket.gethostname())
 816     return _thishost
 817
 818 _ftperrors = None
 819 def ftperrors():
 820     """Return the set of errors raised by the FTP class."""
 821     global _ftperrors
 822     if _ftperrors is None:
 823         import ftplib
 824         _ftperrors = ftplib.all_errors
 825     return _ftperrors
 826
 827 _noheaders = None
 828 def noheaders():
 829     """Return an empty mimetools.Message object."""
 830     global _noheaders
 831     if _noheaders is None:
 832         import mimetools
 833         try:
 834             from cStringIO import StringIO
 835         except ImportError:
 836             from StringIO import StringIO
 837         _noheaders = mimetools.Message(StringIO(), 0)
 838         _noheaders.fp.close()   # Recycle file descriptor
 839     return _noheaders
 840
 841
 842 # Utility classes
 843
 844 class ftpwrapper:
 845     """Class used by open_ftp() for cache of open FTP connections."""
 846
 847     def __init__(self, user, passwd, host, port, dirs,
 848                  timeout=socket._GLOBAL_DEFAULT_TIMEOUT):
 849         self.user = user
 850         self.passwd = passwd
 851         self.host = host
 852         self.port = port
 853         self.dirs = dirs
 854         self.timeout = timeout
 855         self.init()
 856
 857     def init(self):
 858         import ftplib
 859         self.busy = 0
 860         self.ftp = ftplib.FTP()
 861         self.ftp.connect(self.host, self.port, self.timeout)
 862         self.ftp.login(self.user, self.passwd)
 863         for dir in self.dirs:
 864             self.ftp.cwd(dir)
 865
 866     def retrfile(self, file, type):
 867         import ftplib
 868         self.endtransfer()
 869         if type in ('d', 'D'): cmd = 'TYPE A'; isdir = 1
 870         else: cmd = 'TYPE ' + type; isdir = 0
 871         try:
 872             self.ftp.voidcmd(cmd)
 873         except ftplib.all_errors:
 874             self.init()
 875             self.ftp.voidcmd(cmd)
 876         conn = None
 877         if file and not isdir:
 878             # Try to retrieve as a file
 879             try:
 880                 cmd = 'RETR ' + file
 881                 conn = self.ftp.ntransfercmd(cmd)
 882             except ftplib.error_perm, reason:
 883                 if str(reason)[:3] != '550':
 884                     raise IOError, ('ftp error', reason), sys.exc_info()[2]
 885         if not conn:
 886             # Set transfer mode to ASCII!
 887             self.ftp.voidcmd('TYPE A')
 888             # Try a directory listing. Verify that directory exists.
 889             if file:
 890                 pwd = self.ftp.pwd()
 891                 try:
 892                     try:
 893                         self.ftp.cwd(file)
 894                     except ftplib.error_perm, reason:
 895                         raise IOError, ('ftp error', reason), sys.exc_info()[2]
 896                 finally:
 897                     self.ftp.cwd(pwd)
 898                 cmd = 'LIST ' + file
 899             else:
 900                 cmd = 'LIST'
 901             conn = self.ftp.ntransfercmd(cmd)
 902         self.busy = 1
 903         # Pass back both a suitably decorated object and a retrieval length
 904         return (addclosehook(conn[0].makefile('rb'),
 905                              self.endtransfer), conn[1])
 906     def endtransfer(self):
 907         if not self.busy:
 908             return
 909         self.busy = 0
 910         try:
 911             self.ftp.voidresp()
 912         except ftperrors():
 913             pass
 914
 915     def close(self):
 916         self.endtransfer()
 917         try:
 918             self.ftp.close()
 919         except ftperrors():
 920             pass
 921
 922 class addbase:
 923     """Base class for addinfo and addclosehook."""
 924
 925     def __init__(self, fp):
 926         self.fp = fp
 927         self.read = self.fp.read
 928         self.readline = self.fp.readline
 929         if hasattr(self.fp, "readlines"): self.readlines = self.fp.readlines
 930         if hasattr(self.fp, "fileno"):
 931             self.fileno = self.fp.fileno
 932         else:
 933             self.fileno = lambda: None
 934         if hasattr(self.fp, "__iter__"):
 935             self.__iter__ = self.fp.__iter__
 936             if hasattr(self.fp, "next"):
 937                 self.next = self.fp.next
 938
 939     def __repr__(self):
 940         return '<%s at %r whose fp = %r>' % (self.__class__.__name__,
 941                                              id(self), self.fp)
 942
 943     def close(self):
 944         self.read = None
 945         self.readline = None
 946         self.readlines = None
 947         self.fileno = None
 948         if self.fp: self.fp.close()
 949         self.fp = None
 950
 951 class addclosehook(addbase):
 952     """Class to add a close hook to an open file."""
 953
 954     def __init__(self, fp, closehook, *hookargs):
 955         addbase.__init__(self, fp)
 956         self.closehook = closehook
 957         self.hookargs = hookargs
 958
 959     def close(self):
 960         addbase.close(self)
 961         if self.closehook:
 962             self.closehook(*self.hookargs)
 963             self.closehook = None
 964             self.hookargs = None
 965
 966 class addinfo(addbase):
 967     """class to add an info() method to an open file."""
 968
 969     def __init__(self, fp, headers):
 970         addbase.__init__(self, fp)
 971         self.headers = headers
 972
 973     def info(self):
 974         return self.headers
 975
 976 class addinfourl(addbase):
 977     """class to add info() and geturl() methods to an open file."""
 978
 979     def __init__(self, fp, headers, url, code=None):
 980         addbase.__init__(self, fp)
 981         self.headers = headers
 982         self.url = url
 983         self.code = code
 984
 985     def info(self):
 986         return self.headers
 987
 988     def getcode(self):
 989         return self.code
 990
 991     def geturl(self):
 992         return self.url
 993
 994
 995 # Utilities to parse URLs (most of these return None for missing parts):
 996 # unwrap('<URL:type://host/path>') --> 'type://host/path'
 997 # splittype('type:opaquestring') --> 'type', 'opaquestring'
 998 # splithost('//host[:port]/path') --> 'host[:port]', '/path'
 999 # splituser('user[:passwd]@host[:port]') --> 'user[:passwd]', 'host[:port]'
1000 # splitpasswd('user:passwd') -> 'user', 'passwd'
1001 # splitport('host:port') --> 'host', 'port'
1002 # splitquery('/path?query') --> '/path', 'query'
1003 # splittag('/path#tag') --> '/path', 'tag'
1004 # splitattr('/path;attr1=value1;attr2=value2;...') ->
1005 #   '/path', ['attr1=value1', 'attr2=value2', ...]
1006 # splitvalue('attr=value') --> 'attr', 'value'
1007 # unquote('abc%20def') -> 'abc def'
1008 # quote('abc def') -> 'abc%20def')
1009
1010 try:
1011     unicode
1012 except NameError:
1013     def _is_unicode(x):
1014         return 0
1015 else:
1016     def _is_unicode(x):
1017         return isinstance(x, unicode)
1018
1019 def toBytes(url):
1020     """toBytes(u"URL") --> 'URL'."""
1021     # Most URL schemes require ASCII. If that changes, the conversion
1022     # can be relaxed
1023     if _is_unicode(url):
1024         try:
1025             url = url.encode("ASCII")
1026         except UnicodeError:
1027             raise UnicodeError("URL " + repr(url) +
1028                                " contains non-ASCII characters")
1029     return url
1030
1031 def unwrap(url):
1032     """unwrap('<URL:type://host/path>') --> 'type://host/path'."""
1033     url = url.strip()
1034     if url[:1] == '<' and url[-1:] == '>':
1035         url = url[1:-1].strip()
1036     if url[:4] == 'URL:': url = url[4:].strip()
1037     return url
1038
1039 _typeprog = None
1040 def splittype(url):
1041     """splittype('type:opaquestring') --> 'type', 'opaquestring'."""
1042     global _typeprog
1043     if _typeprog is None:
1044         import re
1045         _typeprog = re.compile('^([^/:]+):')
1046
1047     match = _typeprog.match(url)
1048     if match:
1049         scheme = match.group(1)
1050         return scheme.lower(), url[len(scheme) + 1:]
1051     return None, url
1052
1053 _hostprog = None
1054 def splithost(url):
1055     """splithost('//host[:port]/path') --> 'host[:port]', '/path'."""
1056     global _hostprog
1057     if _hostprog is None:
1058         import re
1059         _hostprog = re.compile('^//([^/?]*)(.*)$')
1060
1061     match = _hostprog.match(url)
1062     if match: return match.group(1, 2)
1063     return None, url
1064
1065 _userprog = None
1066 def splituser(host):
1067     """splituser('user[:passwd]@host[:port]') --> 'user[:passwd]', 'host[:port]'."""
1068     global _userprog
1069     if _userprog is None:
1070         import re
1071         _userprog = re.compile('^(.*)@(.*)$')
1072
1073     match = _userprog.match(host)
1074     if match: return map(unquote, match.group(1, 2))
1075     return None, host
1076
1077 _passwdprog = None
1078 def splitpasswd(user):
1079     """splitpasswd('user:passwd') -> 'user', 'passwd'."""
1080     global _passwdprog
1081     if _passwdprog is None:
1082         import re
1083         _passwdprog = re.compile('^([^:]*):(.*)$')
1084
1085     match = _passwdprog.match(user)
1086     if match: return match.group(1, 2)
1087     return user, None
1088
1089 # splittag('/path#tag') --> '/path', 'tag'
1090 _portprog = None
1091 def splitport(host):
1092     """splitport('host:port') --> 'host', 'port'."""
1093     global _portprog
1094     if _portprog is None:
1095         import re
1096         _portprog = re.compile('^(.*):([0-9]+)$')
1097
1098     match = _portprog.match(host)
1099     if match: return match.group(1, 2)
1100     return host, None
1101
1102 _nportprog = None
1103 def splitnport(host, defport=-1):
1104     """Split host and port, returning numeric port.
1105     Return given default port if no ':' found; defaults to -1.
1106     Return numerical port if a valid number are found after ':'.
1107     Return None if ':' but not a valid number."""
1108     global _nportprog
1109     if _nportprog is None:
1110         import re
1111         _nportprog = re.compile('^(.*):(.*)$')
1112
1113     match = _nportprog.match(host)
1114     if match:
1115         host, port = match.group(1, 2)
1116         try:
1117             if not port: raise ValueError, "no digits"
1118             nport = int(port)
1119         except ValueError:
1120             nport = None
1121         return host, nport
1122     return host, defport
1123
1124 _queryprog = None
1125 def splitquery(url):
1126     """splitquery('/path?query') --> '/path', 'query'."""
1127     global _queryprog
1128     if _queryprog is None:
1129         import re
1130         _queryprog = re.compile('^(.*)\?([^?]*)$')
1131
1132     match = _queryprog.match(url)
1133     if match: return match.group(1, 2)
1134     return url, None
1135
1136 _tagprog = None
1137 def splittag(url):
1138     """splittag('/path#tag') --> '/path', 'tag'."""
1139     global _tagprog
1140     if _tagprog is None:
1141         import re
1142         _tagprog = re.compile('^(.*)#([^#]*)$')
1143
1144     match = _tagprog.match(url)
1145     if match: return match.group(1, 2)
1146     return url, None
1147
1148 def splitattr(url):
1149     """splitattr('/path;attr1=value1;attr2=value2;...') ->
1150         '/path', ['attr1=value1', 'attr2=value2', ...]."""
1151     words = url.split(';')
1152     return words[0], words[1:]
1153
1154 _valueprog = None
1155 def splitvalue(attr):
1156     """splitvalue('attr=value') --> 'attr', 'value'."""
1157     global _valueprog
1158     if _valueprog is None:
1159         import re
1160         _valueprog = re.compile('^([^=]*)=(.*)$')
1161
1162     match = _valueprog.match(attr)
1163     if match: return match.group(1, 2)
1164     return attr, None
1165
1166 _hextochr = dict(('%02x' % i, chr(i)) for i in range(256))
1167 _hextochr.update(('%02X' % i, chr(i)) for i in range(256))
1168
1169 def unquote(s):
1170     """unquote('abc%20def') -> 'abc def'."""
1171     res = s.split('%')
1172     for i in xrange(1, len(res)):
1173         item = res[i]
1174         try:
1175             res[i] = _hextochr[item[:2]] + item[2:]
1176         except KeyError:
1177             res[i] = '%' + item
1178         except UnicodeDecodeError:
1179             res[i] = unichr(int(item[:2], 16)) + item[2:]
1180     return "".join(res)
1181
1182 def unquote_plus(s):
1183     """unquote('%7e/abc+def') -> '~/abc def'"""
1184     s = s.replace('+', ' ')
1185     return unquote(s)
1186
1187 always_safe = ('ABCDEFGHIJKLMNOPQRSTUVWXYZ'
1188                'abcdefghijklmnopqrstuvwxyz'
1189                '0123456789' '_.-')
1190 _safemaps = {}
1191
1192 def quote(s, safe = '/'):
1193     """quote('abc def') -> 'abc%20def'
1194
1195     Each part of a URL, e.g. the path info, the query, etc., has a
1196     different set of reserved characters that must be quoted.
1197
1198     RFC 2396 Uniform Resource Identifiers (URI): Generic Syntax lists
1199     the following reserved characters.
1200
1201     reserved    = ";" | "/" | "?" | ":" | "@" | "&" | "=" | "+" |
1202                   "$" | ","
1203
1204     Each of these characters is reserved in some component of a URL,
1205     but not necessarily in all of them.
1206
1207     By default, the quote function is intended for quoting the path
1208     section of a URL.  Thus, it will not encode '/'.  This character
1209     is reserved, but in typical usage the quote function is being
1210     called on a path where the existing slash characters are used as
1211     reserved characters.
1212     """
1213     cachekey = (safe, always_safe)
1214     try:
1215         safe_map = _safemaps[cachekey]
1216     except KeyError:
1217         safe += always_safe
1218         safe_map = {}
1219         for i in range(256):
1220             c = chr(i)
1221             safe_map[c] = (c in safe) and c or ('%%%02X' % i)
1222         _safemaps[cachekey] = safe_map
1223     res = map(safe_map.__getitem__, s)
1224     return ''.join(res)
1225
1226 def quote_plus(s, safe = ''):
1227     """Quote the query fragment of a URL; replacing ' ' with '+'"""
1228     if ' ' in s:
1229         s = quote(s, safe + ' ')
1230         return s.replace(' ', '+')
1231     return quote(s, safe)
1232
1233 def urlencode(query,doseq=0):
1234     """Encode a sequence of two-element tuples or dictionary into a URL query string.
1235
1236     If any values in the query arg are sequences and doseq is true, each
1237     sequence element is converted to a separate parameter.
1238
1239     If the query arg is a sequence of two-element tuples, the order of the
1240     parameters in the output will match the order of parameters in the
1241     input.
1242     """
1243
1244     if hasattr(query,"items"):
1245         # mapping objects
1246         query = query.items()
1247     else:
1248         # it's a bother at times that strings and string-like objects are
1249         # sequences...
1250         try:
1251             # non-sequence items should not work with len()
1252             # non-empty strings will fail this
1253             if len(query) and not isinstance(query[0], tuple):
1254                 raise TypeError
1255             # zero-length sequences of all types will get here and succeed,
1256             # but that's a minor nit - since the original implementation
1257             # allowed empty dicts that type of behavior probably should be
1258             # preserved for consistency
1259         except TypeError:
1260             ty,va,tb = sys.exc_info()
1261             raise TypeError, "not a valid non-string sequence or mapping object", tb
1262
1263     l = []
1264     if not doseq:
1265         # preserve old behavior
1266         for k, v in query:
1267             k = quote_plus(str(k))
1268             v = quote_plus(str(v))
1269             l.append(k + '=' + v)
1270     else:
1271         for k, v in query:
1272             k = quote_plus(str(k))
1273             if isinstance(v, str):
1274                 v = quote_plus(v)
1275                 l.append(k + '=' + v)
1276             elif _is_unicode(v):
1277                 # is there a reasonable way to convert to ASCII?
1278                 # encode generates a string, but "replace" or "ignore"
1279                 # lose information and "strict" can raise UnicodeError
1280                 v = quote_plus(v.encode("ASCII","replace"))
1281                 l.append(k + '=' + v)
1282             else:
1283                 try:
1284                     # is this a sufficient test for sequence-ness?
1285                     x = len(v)
1286                 except TypeError:
1287                     # not a sequence
1288                     v = quote_plus(str(v))
1289                     l.append(k + '=' + v)
1290                 else:
1291                     # loop over the sequence
1292                     for elt in v:
1293                         l.append(k + '=' + quote_plus(str(elt)))
1294     return '&'.join(l)
1295
1296 # Proxy handling
1297 def getproxies_environment():
1298     """Return a dictionary of scheme -> proxy server URL mappings.
1299
1300     Scan the environment for variables named <scheme>_proxy;
1301     this seems to be the standard convention.  If you need a
1302     different way, you can pass a proxies dictionary to the
1303     [Fancy]URLopener constructor.
1304
1305     """
1306     proxies = {}
1307     for name, value in os.environ.items():
1308         name = name.lower()
1309         if value and name[-6:] == '_proxy':
1310             proxies[name[:-6]] = value
1311     return proxies
1312
1313 def proxy_bypass_environment(host):
1314     """Test if proxies should not be used for a particular host.
1315
1316     Checks the environment for a variable named no_proxy, which should
1317     be a list of DNS suffixes separated by commas, or '*' for all hosts.
1318     """
1319     no_proxy = os.environ.get('no_proxy', '') or os.environ.get('NO_PROXY', '')
1320     # '*' is special case for always bypass
1321     if no_proxy == '*':
1322         return 1
1323     # strip port off host
1324     hostonly, port = splitport(host)
1325     # check if the host ends with any of the DNS suffixes
1326     for name in no_proxy.split(','):
1327         if name and (hostonly.endswith(name) or host.endswith(name)):
1328             return 1
1329     # otherwise, don't bypass
1330     return 0
1331
1332
1333 if sys.platform == 'darwin':
1334     from _scproxy import _get_proxy_settings, _get_proxies
1335
1336     def proxy_bypass_macosx_sysconf(host):
1337         """
1338         Return True iff this host shouldn't be accessed using a proxy
1339
1340         This function uses the MacOSX framework SystemConfiguration
1341         to fetch the proxy information.
1342         """
1343         import re
1344         import socket
1345         from fnmatch import fnmatch
1346
1347         hostonly, port = splitport(host)
1348
1349         def ip2num(ipAddr):
1350             parts = ipAddr.split('.')
1351             parts = map(int, parts)
1352             if len(parts) != 4:
1353                 parts = (parts + [0, 0, 0, 0])[:4]
1354             return (parts[0] << 24) | (parts[1] << 16) | (parts[2] << 8) | parts[3]
1355
1356         proxy_settings = _get_proxy_settings()
1357
1358         # Check for simple host names:
1359         if '.' not in host:
1360             if proxy_settings['exclude_simple']:
1361                 return True
1362
1363         hostIP = None
1364
1365         for value in proxy_settings.get('exceptions', ()):
1366             # Items in the list are strings like these: *.local, 169.254/16
1367             if not value: continue
1368
1369             m = re.match(r"(\d+(?:\.\d+)*)(/\d+)?", value)
1370             if m is not None:
1371                 if hostIP is None:
1372                     try:
1373                         hostIP = socket.gethostbyname(hostonly)
1374                         hostIP = ip2num(hostIP)
1375                     except socket.error:
1376                         continue
1377
1378                 base = ip2num(m.group(1))
1379                 mask = int(m.group(2)[1:])
1380                 mask = 32 - mask
1381
1382                 if (hostIP >> mask) == (base >> mask):
1383                     return True
1384
1385             elif fnmatch(host, value):
1386                 return True
1387
1388         return False
1389
1390
1391     def getproxies_macosx_sysconf():
1392         """Return a dictionary of scheme -> proxy server URL mappings.
1393
1394         This function uses the MacOSX framework SystemConfiguration
1395         to fetch the proxy information.
1396         """
1397         return _get_proxies()
1398
1399
1400
1401     def proxy_bypass(host):
1402         if getproxies_environment():
1403             return proxy_bypass_environment(host)
1404         else:
1405             return proxy_bypass_macosx_sysconf(host)
1406
1407     def getproxies():
1408         return getproxies_environment() or getproxies_macosx_sysconf()
1409
1410 elif os.name == 'nt':
1411     def getproxies_registry():
1412         """Return a dictionary of scheme -> proxy server URL mappings.
1413
1414         Win32 uses the registry to store proxies.
1415
1416         """
1417         proxies = {}
1418         try:
1419             import _winreg
1420         except ImportError:
1421             # Std module, so should be around - but you never know!
1422             return proxies
1423         try:
1424             internetSettings = _winreg.OpenKey(_winreg.HKEY_CURRENT_USER,
1425                 r'Software\Microsoft\Windows\CurrentVersion\Internet Settings')
1426             proxyEnable = _winreg.QueryValueEx(internetSettings,
1427                                                'ProxyEnable')[0]
1428             if proxyEnable:
1429                 # Returned as Unicode but problems if not converted to ASCII
1430                 proxyServer = str(_winreg.QueryValueEx(internetSettings,
1431                                                        'ProxyServer')[0])
1432                 if '=' in proxyServer:
1433                     # Per-protocol settings
1434                     for p in proxyServer.split(';'):
1435                         protocol, address = p.split('=', 1)
1436                         # See if address has a type:// prefix
1437                         import re
1438                         if not re.match('^([^/:]+)://', address):
1439                             address = '%s://%s' % (protocol, address)
1440                         proxies[protocol] = address
1441                 else:
1442                     # Use one setting for all protocols
1443                     if proxyServer[:5] == 'http:':
1444                         proxies['http'] = proxyServer
1445                     else:
1446                         proxies['http'] = 'http://%s' % proxyServer
1447                         proxies['ftp'] = 'ftp://%s' % proxyServer
1448             internetSettings.Close()
1449         except (WindowsError, ValueError, TypeError):
1450             # Either registry key not found etc, or the value in an
1451             # unexpected format.
1452             # proxies already set up to be empty so nothing to do
1453             pass
1454         return proxies
1455
1456     def getproxies():
1457         """Return a dictionary of scheme -> proxy server URL mappings.
1458
1459         Returns settings gathered from the environment, if specified,
1460         or the registry.
1461
1462         """
1463         return getproxies_environment() or getproxies_registry()
1464
1465     def proxy_bypass_registry(host):
1466         try:
1467             import _winreg
1468             import re
1469         except ImportError:
1470             # Std modules, so should be around - but you never know!
1471             return 0
1472         try:
1473             internetSettings = _winreg.OpenKey(_winreg.HKEY_CURRENT_USER,
1474                 r'Software\Microsoft\Windows\CurrentVersion\Internet Settings')
1475             proxyEnable = _winreg.QueryValueEx(internetSettings,
1476                                                'ProxyEnable')[0]
1477             proxyOverride = str(_winreg.QueryValueEx(internetSettings,
1478                                                      'ProxyOverride')[0])
1479             # ^^^^ Returned as Unicode but problems if not converted to ASCII
1480         except WindowsError:
1481             return 0
1482         if not proxyEnable or not proxyOverride:
1483             return 0
1484         # try to make a host list from name and IP address.
1485         rawHost, port = splitport(host)
1486         host = [rawHost]
1487         try:
1488             addr = socket.gethostbyname(rawHost)
1489             if addr != rawHost:
1490                 host.append(addr)
1491         except socket.error:
1492             pass
1493         try:
1494             fqdn = socket.getfqdn(rawHost)
1495             if fqdn != rawHost:
1496                 host.append(fqdn)
1497         except socket.error:
1498             pass
1499         # make a check value list from the registry entry: replace the
1500         # '<local>' string by the localhost entry and the corresponding
1501         # canonical entry.
1502         proxyOverride = proxyOverride.split(';')
1503         i = 0
1504         while i < len(proxyOverride):
1505             if proxyOverride[i] == '<local>':
1506                 proxyOverride[i:i+1] = ['localhost',
1507                                         '127.0.0.1',
1508                                         socket.gethostname(),
1509                                         socket.gethostbyname(
1510                                             socket.gethostname())]
1511             i += 1
1512         # print proxyOverride
1513         # now check if we match one of the registry values.
1514         for test in proxyOverride:
1515             test = test.replace(".", r"\.")     # mask dots
1516             test = test.replace("*", r".*")     # change glob sequence
1517             test = test.replace("?", r".")      # change glob char
1518             for val in host:
1519                 # print "%s <--> %s" %( test, val )
1520                 if re.match(test, val, re.I):
1521                     return 1
1522         return 0
1523
1524     def proxy_bypass(host):
1525         """Return a dictionary of scheme -> proxy server URL mappings.
1526
1527         Returns settings gathered from the environment, if specified,
1528         or the registry.
1529
1530         """
1531         if getproxies_environment():
1532             return proxy_bypass_environment(host)
1533         else:
1534             return proxy_bypass_registry(host)
1535
1536 else:
1537     # By default use environment variables
1538     getproxies = getproxies_environment
1539     proxy_bypass = proxy_bypass_environment
1540
1541 # Test and time quote() and unquote()
1542 def test1():
1543     s = ''
1544     for i in range(256): s = s + chr(i)
1545     s = s*4
1546     t0 = time.time()
1547     qs = quote(s)
1548     uqs = unquote(qs)
1549     t1 = time.time()
1550     if uqs != s:
1551         print 'Wrong!'
1552     print repr(s)
1553     print repr(qs)
1554     print repr(uqs)
1555     print round(t1 - t0, 3), 'sec'
1556
1557
1558 def reporthook(blocknum, blocksize, totalsize):
1559     # Report during remote transfers
1560     print "Block number: %d, Block size: %d, Total size: %d" % (
1561         blocknum, blocksize, totalsize)
1562
1563 # Test program
1564 def test(args=[]):
1565     if not args:
1566         args = [
1567             '/etc/passwd',
1568             'file:/etc/passwd',
1569             'file://localhost/etc/passwd',
1570             'ftp://ftp.gnu.org/pub/README',
1571             'http://www.python.org/index.html',
1572             ]
1573         if hasattr(URLopener, "open_https"):
1574             args.append('https://synergy.as.cmu.edu/~geek/')
1575     try:
1576         for url in args:
1577             print '-'*10, url, '-'*10
1578             fn, h = urlretrieve(url, None, reporthook)
1579             print fn
1580             if h:
1581                 print '======'
1582                 for k in h.keys(): print k + ':', h[k]
1583                 print '======'
1584             fp = open(fn, 'rb')
1585             data = fp.read()
1586             del fp
1587             if '\r' in data:
1588                 table = string.maketrans("", "")
1589                 data = data.translate(table, "\r")
1590             print data
1591             fn, h = None, None
1592         print '-'*40
1593     finally:
1594         urlcleanup()
1595
1596 def main():
1597     import getopt, sys
1598     try:
1599         opts, args = getopt.getopt(sys.argv[1:], "th")
1600     except getopt.error, msg:
1601         print msg
1602         print "Use -h for help"
1603         return
1604     t = 0
1605     for o, a in opts:
1606         if o == '-t':
1607             t = t + 1
1608         if o == '-h':
1609             print "Usage: python urllib.py [-t] [url ...]"
1610             print "-t runs self-test;",
1611             print "otherwise, contents of urls are printed"
1612             return
1613     if t:
1614         if t > 1:
1615             test1()
1616         test(args)
1617     else:
1618         if not args:
1619             print "Use -h for help"
1620         for url in args:
1621             print urlopen(url).read(),
1622
1623 # Run test program when run as a script
1624 if __name__ == '__main__':
1625     main()