1 """Open an arbitrary URL.
3 See the following document for more info on URLs:
4 "Names and Addresses, URIs, URLs, URNs, URCs", at
5 http://www.w3.org/pub/WWW/Addressing/Overview.html
7 See also the HTTP spec (from which the error codes are derived):
8 "HTTP - Hypertext Transfer Protocol", at
9 http://www.w3.org/pub/WWW/Protocols/
11 Related standards and specs:
12 - RFC1808: the "relative URL" spec. (authoritative status)
13 - RFC1738 - the "URL standard". (authoritative status)
14 - RFC1630 - the "URI spec". (informational status)
16 The object returned by URLopener().open(file) will differ per
protocol. All you know is that it has methods read(), readline(),
18 readlines(), fileno(), close() and info(). The read*(), fileno()
19 and close() methods work like those of open files.
20 The info() method returns a mimetools.Message object which can be
21 used to query various info about the object, if available.
22 (mimetools.Message objects are queried with the getheader() method.)
30 from urlparse import urljoin as basejoin
33 __all__ = ["urlopen", "URLopener", "FancyURLopener", "urlretrieve",
34 "urlcleanup", "quote", "quote_plus", "unquote", "unquote_plus",
35 "urlencode", "url2pathname", "pathname2url", "splittag",
36 "localhost", "thishost", "ftperrors", "basejoin", "unwrap",
37 "splittype", "splithost", "splituser", "splitpasswd", "splitport",
38 "splitnport", "splitquery", "splitattr", "splitvalue",
41 __version__ = '1.17' # XXX This version is not always updated :-(
43 MAXFTPCACHE = 10 # Trim the ftp cache beyond this size
45 # Helper for non-unix systems
47 from macurl2path import url2pathname, pathname2url
49 from nturl2path import url2pathname, pathname2url
50 elif os.name == 'riscos':
51 from rourl2path import url2pathname, pathname2url
def url2pathname(pathname):
    """Convert a relative URL of the 'file' scheme to a file system path.

    OS-specific; not recommended for general use.
    """
    # Generic fallback: a URL path is just the percent-decoded string.
    return unquote(pathname)
def pathname2url(pathname):
    """Convert a file system path to a relative URL of the 'file' scheme.

    OS-specific; not recommended for general use.
    """
    # Generic fallback: the path is simply percent-encoded.
    return quote(pathname)
63 # This really consists of two pieces:
64 # (1) a class which handles opening of all sorts of URLs
65 # (plus assorted utilities etc.)
66 # (2) a set of functions for parsing URLs
67 # XXX Should these be separated out into different modules?
70 # Shortcut for basic usage
def urlopen(url, data=None, proxies=None):
    """Create a file-like object for the specified URL to read from."""
    from warnings import warnpy3k
    # Bug fix: only warnpy3k was imported, but the call went through the
    # unimported `warnings` module name; call the imported function directly.
    warnpy3k("urllib.urlopen() has been removed in Python 3.0 in "
             "favor of urllib2.urlopen()", stacklevel=2)

    global _urlopener
    if proxies is not None:
        # Explicit proxy mapping: use a throw-away opener.
        opener = FancyURLopener(proxies=proxies)
    elif not _urlopener:
        # First call: create and cache the module-wide opener.
        opener = FancyURLopener()
        _urlopener = opener
    else:
        opener = _urlopener
    if data is None:
        return opener.open(url)
    else:
        return opener.open(url, data)
def urlretrieve(url, filename=None, reporthook=None, data=None):
    """Retrieve a URL into a (possibly temporary) local file.

    Returns a (filename, headers) tuple.  Delegates to the module-wide
    cached FancyURLopener so urlcleanup() can later remove temp files;
    the previous code built a fresh opener per call and leaked the cache.
    """
    global _urlopener
    if not _urlopener:
        _urlopener = FancyURLopener()
    return _urlopener.retrieve(url, filename, reporthook, data)
class ContentTooShortError(IOError):
    """Raised when the downloaded size does not match Content-Length.

    The partially downloaded body is kept on the `content` attribute so
    callers can inspect what was received.
    """
    def __init__(self, message, content):
        IOError.__init__(self, message)
        self.content = content
115 """Class to open URLs.
116 This is a class rather than just a subroutine because we may need
117 more than one set of global protocol-specific options.
118 Note -- this is a base class for those who don't want the
119 automatic handling of errors type 302 (relocated) and 401
120 (authorization needed)."""
124 version = "Python-urllib/%s" % __version__
127 def __init__(self, proxies=None, **x509):
129 proxies = getproxies()
130 assert hasattr(proxies, 'has_key'), "proxies must be a mapping"
131 self.proxies = proxies
132 self.key_file = x509.get('key_file')
133 self.cert_file = x509.get('cert_file')
134 self.addheaders = [('User-Agent', self.version)]
135 self.__tempfiles = []
136 self.__unlink = os.unlink # See cleanup()
137 self.tempcache = None
138 # Undocumented feature: if you assign {} to tempcache,
139 # it is used to cache files retrieved with
140 # self.retrieve(). This is not enabled by default
141 # since it does not work for changing documents (and I
142 # haven't got the logic to check expiration headers
144 self.ftpcache = ftpcache
145 # Undocumented feature: you can use a different
146 # ftp cache by assigning to the .ftpcache member;
147 # in case you want logically independent URL openers
148 # XXX This is not threadsafe. Bah.
157 # This code sometimes runs when the rest of this module
158 # has already been deleted, so it can't use any globals
159 # or import anything.
161 for file in self.__tempfiles:
166 del self.__tempfiles[:]
168 self.tempcache.clear()
170 def addheader(self, *args):
171 """Add a header to be used by the HTTP interface only
172 e.g. u.addheader('Accept', 'sound/basic')"""
173 self.addheaders.append(args)
176 def open(self, fullurl, data=None):
177 """Use URLopener().open(file) instead of open(file, 'r')."""
178 fullurl = unwrap(toBytes(fullurl))
179 # percent encode url. fixing lame server errors like space within url
181 fullurl = quote(fullurl, safe="%/:=&?~#+!$,;'@()*[]")
182 if self.tempcache and fullurl in self.tempcache:
183 filename, headers = self.tempcache[fullurl]
184 fp = open(filename, 'rb')
185 return addinfourl(fp, headers, fullurl)
186 urltype, url = splittype(fullurl)
189 if urltype in self.proxies:
190 proxy = self.proxies[urltype]
191 urltype, proxyhost = splittype(proxy)
192 host, selector = splithost(proxyhost)
193 url = (host, fullurl) # Signal special case to open_*()
196 name = 'open_' + urltype
198 name = name.replace('-', '_')
199 if not hasattr(self, name):
201 return self.open_unknown_proxy(proxy, fullurl, data)
203 return self.open_unknown(fullurl, data)
206 return getattr(self, name)(url)
208 return getattr(self, name)(url, data)
209 except socket.error, msg:
210 raise IOError, ('socket error', msg), sys.exc_info()[2]
212 def open_unknown(self, fullurl, data=None):
213 """Overridable interface to open unknown URL type."""
214 type, url = splittype(fullurl)
215 raise IOError, ('url error', 'unknown url type', type)
217 def open_unknown_proxy(self, proxy, fullurl, data=None):
218 """Overridable interface to open unknown URL type."""
219 type, url = splittype(fullurl)
220 raise IOError, ('url error', 'invalid proxy for %s' % type, proxy)
223 def retrieve(self, url, filename=None, reporthook=None, data=None):
224 """retrieve(url) returns (filename, headers) for a local object
225 or (tempfilename, headers) for a remote object."""
226 url = unwrap(toBytes(url))
227 if self.tempcache and url in self.tempcache:
228 return self.tempcache[url]
229 type, url1 = splittype(url)
230 if filename is None and (not type or type == 'file'):
232 fp = self.open_local_file(url1)
235 return url2pathname(splithost(url1)[1]), hdrs
238 fp = self.open(url, data)
242 tfp = open(filename, 'wb')
245 garbage, path = splittype(url)
246 garbage, path = splithost(path or "")
247 path, garbage = splitquery(path or "")
248 path, garbage = splitattr(path or "")
249 suffix = os.path.splitext(path)[1]
250 (fd, filename) = tempfile.mkstemp(suffix)
251 self.__tempfiles.append(filename)
252 tfp = os.fdopen(fd, 'wb')
254 result = filename, headers
255 if self.tempcache is not None:
256 self.tempcache[url] = result
262 if "content-length" in headers:
263 size = int(headers["Content-Length"])
264 reporthook(blocknum, bs, size)
273 reporthook(blocknum, bs, size)
281 # raise exception if actual size does not match content-length header
282 if size >= 0 and read < size:
283 raise ContentTooShortError("retrieval incomplete: got only %i out "
284 "of %i bytes" % (read, size), result)
288 # Each method named open_<type> knows how to open that type of URL
290 def open_http(self, url, data=None):
291 """Use HTTP protocol."""
295 if isinstance(url, str):
296 host, selector = splithost(url)
298 user_passwd, host = splituser(host)
303 # check whether the proxy contains authorization information
304 proxy_passwd, host = splituser(host)
305 # now we proceed with the url we want to obtain
306 urltype, rest = splittype(selector)
309 if urltype.lower() != 'http':
312 realhost, rest = splithost(rest)
314 user_passwd, realhost = splituser(realhost)
316 selector = "%s://%s%s" % (urltype, realhost, rest)
317 if proxy_bypass(realhost):
320 #print "proxy via http:", host, selector
321 if not host: raise IOError, ('http error', 'no host given')
325 proxy_auth = base64.b64encode(proxy_passwd).strip()
331 auth = base64.b64encode(user_passwd).strip()
334 h = httplib.HTTP(host)
336 h.putrequest('POST', selector)
337 h.putheader('Content-Type', 'application/x-www-form-urlencoded')
338 h.putheader('Content-Length', '%d' % len(data))
340 h.putrequest('GET', selector)
341 if proxy_auth: h.putheader('Proxy-Authorization', 'Basic %s' % proxy_auth)
342 if auth: h.putheader('Authorization', 'Basic %s' % auth)
343 if realhost: h.putheader('Host', realhost)
344 for args in self.addheaders: h.putheader(*args)
348 errcode, errmsg, headers = h.getreply()
352 # something went wrong with the HTTP status line
353 raise IOError, ('http protocol error', 0,
354 'got a bad status line', None)
355 # According to RFC 2616, "2xx" code indicates that the client's
356 # request was successfully received, understood, and accepted.
357 if (200 <= errcode < 300):
358 return addinfourl(fp, headers, "http:" + url, errcode)
361 return self.http_error(url, fp, errcode, errmsg, headers)
363 return self.http_error(url, fp, errcode, errmsg, headers, data)
365 def http_error(self, url, fp, errcode, errmsg, headers, data=None):
366 """Handle http errors.
367 Derived class can override this, or provide specific handlers
368 named http_error_DDD where DDD is the 3-digit error code."""
369 # First check if there's a specific handler for this error
370 name = 'http_error_%d' % errcode
371 if hasattr(self, name):
372 method = getattr(self, name)
374 result = method(url, fp, errcode, errmsg, headers)
376 result = method(url, fp, errcode, errmsg, headers, data)
377 if result: return result
378 return self.http_error_default(url, fp, errcode, errmsg, headers)
380 def http_error_default(self, url, fp, errcode, errmsg, headers):
381 """Default error handler: close the connection and raise IOError."""
384 raise IOError, ('http error', errcode, errmsg, headers)
387 def open_https(self, url, data=None):
388 """Use HTTPS protocol."""
393 if isinstance(url, str):
394 host, selector = splithost(url)
396 user_passwd, host = splituser(host)
401 # here, we determine, whether the proxy contains authorization information
402 proxy_passwd, host = splituser(host)
403 urltype, rest = splittype(selector)
406 if urltype.lower() != 'https':
409 realhost, rest = splithost(rest)
411 user_passwd, realhost = splituser(realhost)
413 selector = "%s://%s%s" % (urltype, realhost, rest)
414 #print "proxy via https:", host, selector
415 if not host: raise IOError, ('https error', 'no host given')
418 proxy_auth = base64.b64encode(proxy_passwd).strip()
423 auth = base64.b64encode(user_passwd).strip()
426 h = httplib.HTTPS(host, 0,
427 key_file=self.key_file,
428 cert_file=self.cert_file)
430 h.putrequest('POST', selector)
431 h.putheader('Content-Type',
432 'application/x-www-form-urlencoded')
433 h.putheader('Content-Length', '%d' % len(data))
435 h.putrequest('GET', selector)
436 if proxy_auth: h.putheader('Proxy-Authorization', 'Basic %s' % proxy_auth)
437 if auth: h.putheader('Authorization', 'Basic %s' % auth)
438 if realhost: h.putheader('Host', realhost)
439 for args in self.addheaders: h.putheader(*args)
443 errcode, errmsg, headers = h.getreply()
447 # something went wrong with the HTTP status line
448 raise IOError, ('http protocol error', 0,
449 'got a bad status line', None)
450 # According to RFC 2616, "2xx" code indicates that the client's
451 # request was successfully received, understood, and accepted.
452 if (200 <= errcode < 300):
453 return addinfourl(fp, headers, "https:" + url, errcode)
456 return self.http_error(url, fp, errcode, errmsg, headers)
458 return self.http_error(url, fp, errcode, errmsg, headers,
461 def open_file(self, url):
462 """Use local file or FTP depending on form of URL."""
463 if not isinstance(url, str):
464 raise IOError, ('file error', 'proxy support for file protocol currently not implemented')
465 if url[:2] == '//' and url[2:3] != '/' and url[2:12].lower() != 'localhost/':
466 return self.open_ftp(url)
468 return self.open_local_file(url)
470 def open_local_file(self, url):
471 """Use local file."""
472 import mimetypes, mimetools, email.utils
474 from cStringIO import StringIO
476 from StringIO import StringIO
477 host, file = splithost(url)
478 localname = url2pathname(file)
480 stats = os.stat(localname)
482 raise IOError(e.errno, e.strerror, e.filename)
484 modified = email.utils.formatdate(stats.st_mtime, usegmt=True)
485 mtype = mimetypes.guess_type(url)[0]
486 headers = mimetools.Message(StringIO(
487 'Content-Type: %s\nContent-Length: %d\nLast-modified: %s\n' %
488 (mtype or 'text/plain', size, modified)))
492 urlfile = 'file://' + file
493 return addinfourl(open(localname, 'rb'),
495 host, port = splitport(host)
497 and socket.gethostbyname(host) in (localhost(), thishost()):
500 urlfile = 'file://' + file
501 return addinfourl(open(localname, 'rb'),
503 raise IOError, ('local file error', 'not on local host')
505 def open_ftp(self, url):
506 """Use FTP protocol."""
507 if not isinstance(url, str):
508 raise IOError, ('ftp error', 'proxy support for ftp protocol currently not implemented')
509 import mimetypes, mimetools
511 from cStringIO import StringIO
513 from StringIO import StringIO
514 host, path = splithost(url)
515 if not host: raise IOError, ('ftp error', 'no host given')
516 host, port = splitport(host)
517 user, host = splituser(host)
518 if user: user, passwd = splitpasswd(user)
521 user = unquote(user or '')
522 passwd = unquote(passwd or '')
523 host = socket.gethostbyname(host)
526 port = ftplib.FTP_PORT
529 path, attrs = splitattr(path)
531 dirs = path.split('/')
532 dirs, file = dirs[:-1], dirs[-1]
533 if dirs and not dirs[0]: dirs = dirs[1:]
534 if dirs and not dirs[0]: dirs[0] = '/'
535 key = user, host, port, '/'.join(dirs)
537 if len(self.ftpcache) > MAXFTPCACHE:
538 # Prune the cache, rather arbitrarily
539 for k in self.ftpcache.keys():
545 if not key in self.ftpcache:
546 self.ftpcache[key] = \
547 ftpwrapper(user, passwd, host, port, dirs)
548 if not file: type = 'D'
551 attr, value = splitvalue(attr)
552 if attr.lower() == 'type' and \
553 value in ('a', 'A', 'i', 'I', 'd', 'D'):
555 (fp, retrlen) = self.ftpcache[key].retrfile(file, type)
556 mtype = mimetypes.guess_type("ftp:" + url)[0]
559 headers += "Content-Type: %s\n" % mtype
560 if retrlen is not None and retrlen >= 0:
561 headers += "Content-Length: %d\n" % retrlen
562 headers = mimetools.Message(StringIO(headers))
563 return addinfourl(fp, headers, "ftp:" + url)
564 except ftperrors(), msg:
565 raise IOError, ('ftp error', msg), sys.exc_info()[2]
567 def open_data(self, url, data=None):
568 """Use "data" URL."""
569 if not isinstance(url, str):
570 raise IOError, ('data error', 'proxy support for data protocol currently not implemented')
573 # syntax of data URLs:
574 # dataurl := "data:" [ mediatype ] [ ";base64" ] "," data
575 # mediatype := [ type "/" subtype ] *( ";" parameter )
577 # parameter := attribute "=" value
580 from cStringIO import StringIO
582 from StringIO import StringIO
584 [type, data] = url.split(',', 1)
586 raise IOError, ('data error', 'bad data URL')
588 type = 'text/plain;charset=US-ASCII'
589 semi = type.rfind(';')
590 if semi >= 0 and '=' not in type[semi:]:
591 encoding = type[semi+1:]
596 msg.append('Date: %s'%time.strftime('%a, %d %b %Y %T GMT',
597 time.gmtime(time.time())))
598 msg.append('Content-type: %s' % type)
599 if encoding == 'base64':
601 data = base64.decodestring(data)
604 msg.append('Content-Length: %d' % len(data))
609 headers = mimetools.Message(f, 0)
610 #f.fileno = None # needed for addinfourl
611 return addinfourl(f, headers, url)
614 class FancyURLopener(URLopener):
615 """Derived class with handlers for errors we can handle (perhaps)."""
617 def __init__(self, *args, **kwargs):
618 URLopener.__init__(self, *args, **kwargs)
623 def http_error_default(self, url, fp, errcode, errmsg, headers):
624 """Default error handling -- don't raise an exception."""
625 return addinfourl(fp, headers, "http:" + url, errcode)
627 def http_error_302(self, url, fp, errcode, errmsg, headers, data=None):
628 """Error 302 -- relocated (temporarily)."""
630 if self.maxtries and self.tries >= self.maxtries:
631 if hasattr(self, "http_error_500"):
632 meth = self.http_error_500
634 meth = self.http_error_default
636 return meth(url, fp, 500,
637 "Internal Server Error: Redirect Recursion", headers)
638 result = self.redirect_internal(url, fp, errcode, errmsg, headers,
643 def redirect_internal(self, url, fp, errcode, errmsg, headers, data):
644 if 'location' in headers:
645 newurl = headers['location']
646 elif 'uri' in headers:
647 newurl = headers['uri']
652 # In case the server sent a relative URL, join with original:
653 newurl = basejoin(self.type + ":" + url, newurl)
654 return self.open(newurl)
    def http_error_301(self, url, fp, errcode, errmsg, headers, data=None):
        """Error 301 -- also relocated (permanently)."""
        # Treated exactly like a temporary redirect: delegate to the 302
        # handler, which follows the Location header.
        return self.http_error_302(url, fp, errcode, errmsg, headers, data)
    def http_error_303(self, url, fp, errcode, errmsg, headers, data=None):
        """Error 303 -- also relocated (essentially identical to 302)."""
        # Same follow-the-redirect behavior as 302.
        return self.http_error_302(url, fp, errcode, errmsg, headers, data)
664 def http_error_307(self, url, fp, errcode, errmsg, headers, data=None):
665 """Error 307 -- relocated, but turn POST into error."""
667 return self.http_error_302(url, fp, errcode, errmsg, headers, data)
669 return self.http_error_default(url, fp, errcode, errmsg, headers)
671 def http_error_401(self, url, fp, errcode, errmsg, headers, data=None):
672 """Error 401 -- authentication required.
673 This function supports Basic authentication only."""
674 if not 'www-authenticate' in headers:
675 URLopener.http_error_default(self, url, fp,
676 errcode, errmsg, headers)
677 stuff = headers['www-authenticate']
679 match = re.match('[ \t]*([^ \t]+)[ \t]+realm="([^"]*)"', stuff)
681 URLopener.http_error_default(self, url, fp,
682 errcode, errmsg, headers)
683 scheme, realm = match.groups()
684 if scheme.lower() != 'basic':
685 URLopener.http_error_default(self, url, fp,
686 errcode, errmsg, headers)
687 name = 'retry_' + self.type + '_basic_auth'
689 return getattr(self,name)(url, realm)
691 return getattr(self,name)(url, realm, data)
693 def http_error_407(self, url, fp, errcode, errmsg, headers, data=None):
694 """Error 407 -- proxy authentication required.
695 This function supports Basic authentication only."""
696 if not 'proxy-authenticate' in headers:
697 URLopener.http_error_default(self, url, fp,
698 errcode, errmsg, headers)
699 stuff = headers['proxy-authenticate']
701 match = re.match('[ \t]*([^ \t]+)[ \t]+realm="([^"]*)"', stuff)
703 URLopener.http_error_default(self, url, fp,
704 errcode, errmsg, headers)
705 scheme, realm = match.groups()
706 if scheme.lower() != 'basic':
707 URLopener.http_error_default(self, url, fp,
708 errcode, errmsg, headers)
709 name = 'retry_proxy_' + self.type + '_basic_auth'
711 return getattr(self,name)(url, realm)
713 return getattr(self,name)(url, realm, data)
715 def retry_proxy_http_basic_auth(self, url, realm, data=None):
716 host, selector = splithost(url)
717 newurl = 'http://' + host + selector
718 proxy = self.proxies['http']
719 urltype, proxyhost = splittype(proxy)
720 proxyhost, proxyselector = splithost(proxyhost)
721 i = proxyhost.find('@') + 1
722 proxyhost = proxyhost[i:]
723 user, passwd = self.get_user_passwd(proxyhost, realm, i)
724 if not (user or passwd): return None
725 proxyhost = quote(user, safe='') + ':' + quote(passwd, safe='') + '@' + proxyhost
726 self.proxies['http'] = 'http://' + proxyhost + proxyselector
728 return self.open(newurl)
730 return self.open(newurl, data)
732 def retry_proxy_https_basic_auth(self, url, realm, data=None):
733 host, selector = splithost(url)
734 newurl = 'https://' + host + selector
735 proxy = self.proxies['https']
736 urltype, proxyhost = splittype(proxy)
737 proxyhost, proxyselector = splithost(proxyhost)
738 i = proxyhost.find('@') + 1
739 proxyhost = proxyhost[i:]
740 user, passwd = self.get_user_passwd(proxyhost, realm, i)
741 if not (user or passwd): return None
742 proxyhost = quote(user, safe='') + ':' + quote(passwd, safe='') + '@' + proxyhost
743 self.proxies['https'] = 'https://' + proxyhost + proxyselector
745 return self.open(newurl)
747 return self.open(newurl, data)
749 def retry_http_basic_auth(self, url, realm, data=None):
750 host, selector = splithost(url)
751 i = host.find('@') + 1
753 user, passwd = self.get_user_passwd(host, realm, i)
754 if not (user or passwd): return None
755 host = quote(user, safe='') + ':' + quote(passwd, safe='') + '@' + host
756 newurl = 'http://' + host + selector
758 return self.open(newurl)
760 return self.open(newurl, data)
762 def retry_https_basic_auth(self, url, realm, data=None):
763 host, selector = splithost(url)
764 i = host.find('@') + 1
766 user, passwd = self.get_user_passwd(host, realm, i)
767 if not (user or passwd): return None
768 host = quote(user, safe='') + ':' + quote(passwd, safe='') + '@' + host
769 newurl = 'https://' + host + selector
771 return self.open(newurl)
773 return self.open(newurl, data)
775 def get_user_passwd(self, host, realm, clear_cache = 0):
776 key = realm + '@' + host.lower()
777 if key in self.auth_cache:
779 del self.auth_cache[key]
781 return self.auth_cache[key]
782 user, passwd = self.prompt_user_passwd(host, realm)
783 if user or passwd: self.auth_cache[key] = (user, passwd)
786 def prompt_user_passwd(self, host, realm):
787 """Override this in a GUI environment!"""
790 user = raw_input("Enter username for %s at %s: " % (realm,
792 passwd = getpass.getpass("Enter password for %s in %s at %s: " %
795 except KeyboardInterrupt:
804 """Return the IP address of the magic hostname 'localhost'."""
806 if _localhost is None:
807 _localhost = socket.gethostbyname('localhost')
812 """Return the IP address of the current host."""
814 if _thishost is None:
815 _thishost = socket.gethostbyname(socket.gethostname())
820 """Return the set of errors raised by the FTP class."""
822 if _ftperrors is None:
824 _ftperrors = ftplib.all_errors
829 """Return an empty mimetools.Message object."""
831 if _noheaders is None:
834 from cStringIO import StringIO
836 from StringIO import StringIO
837 _noheaders = mimetools.Message(StringIO(), 0)
838 _noheaders.fp.close() # Recycle file descriptor
845 """Class used by open_ftp() for cache of open FTP connections."""
847 def __init__(self, user, passwd, host, port, dirs,
848 timeout=socket._GLOBAL_DEFAULT_TIMEOUT):
854 self.timeout = timeout
860 self.ftp = ftplib.FTP()
861 self.ftp.connect(self.host, self.port, self.timeout)
862 self.ftp.login(self.user, self.passwd)
863 for dir in self.dirs:
866 def retrfile(self, file, type):
869 if type in ('d', 'D'): cmd = 'TYPE A'; isdir = 1
870 else: cmd = 'TYPE ' + type; isdir = 0
872 self.ftp.voidcmd(cmd)
873 except ftplib.all_errors:
875 self.ftp.voidcmd(cmd)
877 if file and not isdir:
878 # Try to retrieve as a file
881 conn = self.ftp.ntransfercmd(cmd)
882 except ftplib.error_perm, reason:
883 if str(reason)[:3] != '550':
884 raise IOError, ('ftp error', reason), sys.exc_info()[2]
886 # Set transfer mode to ASCII!
887 self.ftp.voidcmd('TYPE A')
888 # Try a directory listing. Verify that directory exists.
894 except ftplib.error_perm, reason:
895 raise IOError, ('ftp error', reason), sys.exc_info()[2]
901 conn = self.ftp.ntransfercmd(cmd)
903 # Pass back both a suitably decorated object and a retrieval length
904 return (addclosehook(conn[0].makefile('rb'),
905 self.endtransfer), conn[1])
906 def endtransfer(self):
923 """Base class for addinfo and addclosehook."""
925 def __init__(self, fp):
927 self.read = self.fp.read
928 self.readline = self.fp.readline
929 if hasattr(self.fp, "readlines"): self.readlines = self.fp.readlines
930 if hasattr(self.fp, "fileno"):
931 self.fileno = self.fp.fileno
933 self.fileno = lambda: None
934 if hasattr(self.fp, "__iter__"):
935 self.__iter__ = self.fp.__iter__
936 if hasattr(self.fp, "next"):
937 self.next = self.fp.next
940 return '<%s at %r whose fp = %r>' % (self.__class__.__name__,
946 self.readlines = None
948 if self.fp: self.fp.close()
951 class addclosehook(addbase):
952 """Class to add a close hook to an open file."""
    def __init__(self, fp, closehook, *hookargs):
        # Wrap fp via addbase, then record the hook callable and its
        # arguments; close() is expected to invoke closehook(*hookargs)
        # exactly once.
        addbase.__init__(self, fp)
        self.closehook = closehook
        self.hookargs = hookargs
962 self.closehook(*self.hookargs)
963 self.closehook = None
966 class addinfo(addbase):
967 """class to add an info() method to an open file."""
    def __init__(self, fp, headers):
        # Wrap fp via addbase and keep the mimetools.Message-style headers
        # object (presumably exposed via info() -- confirm against the
        # rest of the class).
        addbase.__init__(self, fp)
        self.headers = headers
976 class addinfourl(addbase):
977 """class to add info() and geturl() methods to an open file."""
979 def __init__(self, fp, headers, url, code=None):
980 addbase.__init__(self, fp)
981 self.headers = headers
995 # Utilities to parse URLs (most of these return None for missing parts):
996 # unwrap('<URL:type://host/path>') --> 'type://host/path'
997 # splittype('type:opaquestring') --> 'type', 'opaquestring'
998 # splithost('//host[:port]/path') --> 'host[:port]', '/path'
999 # splituser('user[:passwd]@host[:port]') --> 'user[:passwd]', 'host[:port]'
1000 # splitpasswd('user:passwd') -> 'user', 'passwd'
1001 # splitport('host:port') --> 'host', 'port'
1002 # splitquery('/path?query') --> '/path', 'query'
1003 # splittag('/path#tag') --> '/path', 'tag'
1004 # splitattr('/path;attr1=value1;attr2=value2;...') ->
1005 # '/path', ['attr1=value1', 'attr2=value2', ...]
1006 # splitvalue('attr=value') --> 'attr', 'value'
1007 # unquote('abc%20def') -> 'abc def'
1008 # quote('abc def') -> 'abc%20def')
1017 return isinstance(x, unicode)
1020 """toBytes(u"URL") --> 'URL'."""
1021 # Most URL schemes require ASCII. If that changes, the conversion
1023 if _is_unicode(url):
1025 url = url.encode("ASCII")
1026 except UnicodeError:
1027 raise UnicodeError("URL " + repr(url) +
1028 " contains non-ASCII characters")
1032 """unwrap('<URL:type://host/path>') --> 'type://host/path'."""
1034 if url[:1] == '<' and url[-1:] == '>':
1035 url = url[1:-1].strip()
1036 if url[:4] == 'URL:': url = url[4:].strip()
1041 """splittype('type:opaquestring') --> 'type', 'opaquestring'."""
1043 if _typeprog is None:
1045 _typeprog = re.compile('^([^/:]+):')
1047 match = _typeprog.match(url)
1049 scheme = match.group(1)
1050 return scheme.lower(), url[len(scheme) + 1:]
1055 """splithost('//host[:port]/path') --> 'host[:port]', '/path'."""
1057 if _hostprog is None:
1059 _hostprog = re.compile('^//([^/?]*)(.*)$')
1061 match = _hostprog.match(url)
1062 if match: return match.group(1, 2)
_userprog = None
def splituser(host):
    """splituser('user[:passwd]@host[:port]') --> 'user[:passwd]', 'host[:port]'."""
    global _userprog
    if _userprog is None:
        import re
        # Greedy so the *last* '@' splits credentials from the host.
        _userprog = re.compile('^(.*)@(.*)$')

    match = _userprog.match(host)
    if match: return map(unquote, match.group(1, 2))
    # No credentials present.
    return None, host
_passwdprog = None
def splitpasswd(user):
    """splitpasswd('user:passwd') -> 'user', 'passwd'."""
    global _passwdprog
    if _passwdprog is None:
        import re
        _passwdprog = re.compile('^([^:]*):(.*)$')

    match = _passwdprog.match(user)
    if match: return match.group(1, 2)
    # No password present.
    return user, None
1089 # splittag('/path#tag') --> '/path', 'tag'
_portprog = None
def splitport(host):
    """splitport('host:port') --> 'host', 'port'."""
    global _portprog
    if _portprog is None:
        import re
        # Port must be all digits; otherwise the host is left untouched.
        _portprog = re.compile('^(.*):([0-9]+)$')

    match = _portprog.match(host)
    if match: return match.group(1, 2)
    return host, None
_nportprog = None
def splitnport(host, defport=-1):
    """Split host and port, returning numeric port.

    Return given default port if no ':' found; defaults to -1.
    Return numerical port if a valid number are found after ':'.
    Return None if ':' but not a valid number."""
    global _nportprog
    if _nportprog is None:
        import re
        _nportprog = re.compile('^(.*):(.*)$')

    match = _nportprog.match(host)
    if match:
        host, port = match.group(1, 2)
        try:
            # Empty port ('host:') is treated like an invalid number.
            if not port: raise ValueError("no digits")
            nport = int(port)
        except ValueError:
            nport = None
        return host, nport
    return host, defport
_queryprog = None
def splitquery(url):
    """splitquery('/path?query') --> '/path', 'query'."""
    global _queryprog
    if _queryprog is None:
        import re
        # Greedy so the *last* '?' starts the query string.
        _queryprog = re.compile('^(.*)\?([^?]*)$')

    match = _queryprog.match(url)
    if match: return match.group(1, 2)
    return url, None
1138 """splittag('/path#tag') --> '/path', 'tag'."""
1140 if _tagprog is None:
1142 _tagprog = re.compile('^(.*)#([^#]*)$')
1144 match = _tagprog.match(url)
1145 if match: return match.group(1, 2)
1149 """splitattr('/path;attr1=value1;attr2=value2;...') ->
1150 '/path', ['attr1=value1', 'attr2=value2', ...]."""
1151 words = url.split(';')
1152 return words[0], words[1:]
_valueprog = None
def splitvalue(attr):
    """splitvalue('attr=value') --> 'attr', 'value'."""
    global _valueprog
    if _valueprog is None:
        import re
        # Split on the *first* '=': the attribute part contains no '='.
        _valueprog = re.compile('^([^=]*)=(.*)$')

    match = _valueprog.match(attr)
    if match: return match.group(1, 2)
    return attr, None
1166 _hextochr = dict(('%02x' % i, chr(i)) for i in range(256))
1167 _hextochr.update(('%02X' % i, chr(i)) for i in range(256))
1170 """unquote('abc%20def') -> 'abc def'."""
1172 for i in xrange(1, len(res)):
1175 res[i] = _hextochr[item[:2]] + item[2:]
1178 except UnicodeDecodeError:
1179 res[i] = unichr(int(item[:2], 16)) + item[2:]
def unquote_plus(s):
    """unquote('%7e/abc+def') -> '~/abc def'"""
    # '+' encodes a space in query strings; map it before percent-decoding.
    s = s.replace('+', ' ')
    # Bug fix: the decoded result must be returned (was falling off the
    # end and returning None).
    return unquote(s)
1187 always_safe = ('ABCDEFGHIJKLMNOPQRSTUVWXYZ'
1188 'abcdefghijklmnopqrstuvwxyz'
1192 def quote(s, safe = '/'):
1193 """quote('abc def') -> 'abc%20def'
1195 Each part of a URL, e.g. the path info, the query, etc., has a
1196 different set of reserved characters that must be quoted.
1198 RFC 2396 Uniform Resource Identifiers (URI): Generic Syntax lists
1199 the following reserved characters.
1201 reserved = ";" | "/" | "?" | ":" | "@" | "&" | "=" | "+" |
1204 Each of these characters is reserved in some component of a URL,
1205 but not necessarily in all of them.
1207 By default, the quote function is intended for quoting the path
1208 section of a URL. Thus, it will not encode '/'. This character
1209 is reserved, but in typical usage the quote function is being
1210 called on a path where the existing slash characters are used as
1211 reserved characters.
1213 cachekey = (safe, always_safe)
1215 safe_map = _safemaps[cachekey]
1219 for i in range(256):
1221 safe_map[c] = (c in safe) and c or ('%%%02X' % i)
1222 _safemaps[cachekey] = safe_map
1223 res = map(safe_map.__getitem__, s)
def quote_plus(s, safe = ''):
    """Quote the query fragment of a URL; replacing ' ' with '+'"""
    # Bug fix: only take the space-to-plus path when a space is actually
    # present; otherwise quote normally (the second return was dead code).
    if ' ' in s:
        s = quote(s, safe + ' ')
        return s.replace(' ', '+')
    return quote(s, safe)
# Encode a mapping or sequence of 2-tuples as an application/x-www-form-urlencoded
# query string.  NOTE(review): line-sampled excerpt -- the try/except scaffolding,
# the result-list initialisation, the doseq branch's loop headers, and the final
# join/return are not visible here; comments below describe only what is shown.
1233 def urlencode(query,doseq=0):
1234 """Encode a sequence of two-element tuples or dictionary into a URL query string.
1236 If any values in the query arg are sequences and doseq is true, each
1237 sequence element is converted to a separate parameter.
1239 If the query arg is a sequence of two-element tuples, the order of the
1240 parameters in the output will match the order of parameters in the
# Mapping input: normalise to a sequence of (key, value) pairs.
1244 if hasattr(query,"items"):
1246 query = query.items()
1248 # it's a bother at times that strings and string-like objects are
1251 # non-sequence items should not work with len()
1252 # non-empty strings will fail this
# Sanity check that a non-mapping argument looks like a sequence of tuples.
1253 if len(query) and not isinstance(query[0], tuple):
1255 # zero-length sequences of all types will get here and succeed,
1256 # but that's a minor nit - since the original implementation
1257 # allowed empty dicts that type of behavior probably should be
1258 # preserved for consistency
# Re-raise with a clearer message while keeping the original traceback
# (Python 2 three-argument raise form).
1260 ty,va,tb = sys.exc_info()
1261 raise TypeError, "not a valid non-string sequence or mapping object", tb
# doseq false: stringify and quote every key and value as one parameter each.
1265 # preserve old behavior
1267 k = quote_plus(str(k))
1268 v = quote_plus(str(v))
1269 l.append(k + '=' + v)
# doseq true: str values are handled as a single parameter (the quoting of v
# happens on a sampled-out line); unicode is ASCII-encoded with "replace";
# anything else is tried as a sequence and expanded one parameter per element.
1272 k = quote_plus(str(k))
1273 if isinstance(v, str):
1275 l.append(k + '=' + v)
1276 elif _is_unicode(v):
1277 # is there a reasonable way to convert to ASCII?
1278 # encode generates a string, but "replace" or "ignore"
1279 # lose information and "strict" can raise UnicodeError
1280 v = quote_plus(v.encode("ASCII","replace"))
1281 l.append(k + '=' + v)
1284 # is this a sufficient test for sequence-ness?
# Scalar that failed the sequence test (the len(v) probe is sampled out).
1288 v = quote_plus(str(v))
1289 l.append(k + '=' + v)
1291 # loop over the sequence
1293 l.append(k + '=' + quote_plus(str(elt)))
# NOTE(review): line-sampled excerpt -- the "proxies = {}" initialisation and
# the final "return proxies" are not visible here; only the scan loop is shown.
1297 def getproxies_environment():
1298 """Return a dictionary of scheme -> proxy server URL mappings.
1300 Scan the environment for variables named <scheme>_proxy;
1301 this seems to be the standard convention. If you need a
1302 different way, you can pass a proxies dictionary to the
1303 [Fancy]URLopener constructor.
# Any non-empty variable whose name ends in '_proxy' contributes a mapping
# keyed by the scheme prefix (e.g. 'http_proxy' -> proxies['http']).
1307 for name, value in os.environ.items():
1309 if value and name[-6:] == '_proxy':
1310 proxies[name[:-6]] = value
# NOTE(review): line-sampled excerpt -- the '*' wildcard handling and the
# return statements are not visible here.
1313 def proxy_bypass_environment(host):
1314 """Test if proxies should not be used for a particular host.
1316 Checks the environment for a variable named no_proxy, which should
1317 be a list of DNS suffixes separated by commas, or '*' for all hosts.
# Lowercase form takes precedence over NO_PROXY when both are set.
1319 no_proxy = os.environ.get('no_proxy', '') or os.environ.get('NO_PROXY', '')
1320 # '*' is special case for always bypass
1323 # strip port off host
1324 hostonly, port = splitport(host)
1325 # check if the host ends with any of the DNS suffixes
# Match against both the port-stripped host and the original host string,
# so entries that include a port (e.g. 'example.com:8080') also work.
1326 for name in no_proxy.split(','):
1327 if name and (hostonly.endswith(name) or host.endswith(name)):
1329 # otherwise, don't bypass
# Mac OS X branch: proxy configuration comes from the SystemConfiguration
# framework via the private _scproxy extension module.
# NOTE(review): line-sampled excerpt -- try/except scaffolding, several
# return statements, and some intermediate assignments are not visible here.
1333 if sys.platform == 'darwin':
1334 from _scproxy import _get_proxy_settings, _get_proxies
1336 def proxy_bypass_macosx_sysconf(host):
1338 Return True iff this host shouldn't be accessed using a proxy
1340 This function uses the MacOSX framework SystemConfiguration
1341 to fetch the proxy information.
1345 from fnmatch import fnmatch
1347 hostonly, port = splitport(host)
# ip2num: pack a dotted-quad string into a 32-bit integer (missing octets
# are zero-filled so partial addresses like '169.254' still convert).
1350 parts = ipAddr.split('.')
1351 parts = map(int, parts)
1353 parts = (parts + [0, 0, 0, 0])[:4]
1354 return (parts[0] << 24) | (parts[1] << 16) | (parts[2] << 8) | parts[3]
1356 proxy_settings = _get_proxy_settings()
1358 # Check for simple host names:
1360 if proxy_settings['exclude_simple']:
1365 for value in proxy_settings.get('exceptions', ()):
1366 # Items in the list are strings like these: *.local, 169.254/16
1367 if not value: continue
# Entries that look like an IP address (optionally with a /prefix) are
# compared numerically against the resolved host address.
1369 m = re.match(r"(\d+(?:\.\d+)*)(/\d+)?", value)
1373 hostIP = socket.gethostbyname(hostonly)
1374 hostIP = ip2num(hostIP)
1375 except socket.error:
1378 base = ip2num(m.group(1))
1379 mask = int(m.group(2)[1:])
# NOTE(review): the shift below uses 'mask' directly; whether an intervening
# sampled-out line converts the prefix length (e.g. mask = 32 - mask) cannot
# be confirmed from this excerpt -- verify before changing.
1382 if (hostIP >> mask) == (base >> mask):
# Non-numeric entries are treated as fnmatch-style glob patterns (*.local).
1385 elif fnmatch(host, value):
1391 def getproxies_macosx_sysconf():
1392 """Return a dictionary of scheme -> proxy server URL mappings.
1394 This function uses the MacOSX framework SystemConfiguration
1395 to fetch the proxy information.
1397 return _get_proxies()
# Environment variables, when present, take precedence over the system
# configuration for both bypass decisions and proxy discovery.
1401 def proxy_bypass(host):
1402 if getproxies_environment():
1403 return proxy_bypass_environment(host)
1405 return proxy_bypass_macosx_sysconf(host)
1408 return getproxies_environment() or getproxies_macosx_sysconf()
# Windows branch: proxy configuration is read from the Internet Settings
# registry key via _winreg.  NOTE(review): line-sampled excerpt -- the
# import/try scaffolding, the ProxyEnable value name, the 'proxies = {}'
# initialisation and the return are not visible here.
1410 elif os.name == 'nt':
1411 def getproxies_registry():
1412 """Return a dictionary of scheme -> proxy server URL mappings.
1414 Win32 uses the registry to store proxies.
1421 # Std module, so should be around - but you never know!
1424 internetSettings = _winreg.OpenKey(_winreg.HKEY_CURRENT_USER,
1425 r'Software\Microsoft\Windows\CurrentVersion\Internet Settings')
1426 proxyEnable = _winreg.QueryValueEx(internetSettings,
1429 # Returned as Unicode but problems if not converted to ASCII
1430 proxyServer = str(_winreg.QueryValueEx(internetSettings,
# A '=' in the value means per-protocol entries, e.g.
# "http=127.0.0.1:8080;ftp=proxy.example:21".
1432 if '=' in proxyServer:
1433 # Per-protocol settings
1434 for p in proxyServer.split(';'):
1435 protocol, address = p.split('=', 1)
1436 # See if address has a type:// prefix
1438 if not re.match('^([^/:]+)://', address):
1439 address = '%s://%s' % (protocol, address)
1440 proxies[protocol] = address
1442 # Use one setting for all protocols
# A single host[:port] value is applied to both http and ftp, prefixing
# 'http://' unless the value already starts with 'http:'.
1443 if proxyServer[:5] == 'http:':
1444 proxies['http'] = proxyServer
1446 proxies['http'] = 'http://%s' % proxyServer
1447 proxies['ftp'] = 'ftp://%s' % proxyServer
1448 internetSettings.Close()
# Best-effort: any registry failure or malformed value yields an empty dict.
1449 except (WindowsError, ValueError, TypeError):
1450 # Either registry key not found etc, or the value in an
1451 # unexpected format.
1452 # proxies already set up to be empty so nothing to do
# NOTE(review): the 'def getproxies():' line itself is sampled out of this
# excerpt; what follows is its docstring and return statement.
1457 """Return a dictionary of scheme -> proxy server URL mappings.
1459 Returns settings gathered from the environment, if specified,
# Environment settings win; the registry is only consulted when the
# environment yields nothing (empty dict is falsy).
1463 return getproxies_environment() or getproxies_registry()
# Decide, from the Windows registry's ProxyOverride value, whether 'host'
# should be reached directly (bypassing the proxy).  NOTE(review): line-sampled
# excerpt -- the import/try scaffolding, the ProxyEnable value name, the host
# list construction, the loop counter handling and the return statements are
# not visible here.
1465 def proxy_bypass_registry(host):
1470 # Std modules, so should be around - but you never know!
1473 internetSettings = _winreg.OpenKey(_winreg.HKEY_CURRENT_USER,
1474 r'Software\Microsoft\Windows\CurrentVersion\Internet Settings')
1475 proxyEnable = _winreg.QueryValueEx(internetSettings,
1477 proxyOverride = str(_winreg.QueryValueEx(internetSettings,
1478 'ProxyOverride')[0])
1479 # ^^^^ Returned as Unicode but problems if not converted to ASCII
1480 except WindowsError:
# No proxying enabled or no override list: nothing to bypass.
1482 if not proxyEnable or not proxyOverride:
1484 # try to make a host list from name and IP address.
1485 rawHost, port = splitport(host)
# Resolution is best-effort: DNS failures are swallowed and the host is
# simply matched by whatever names/addresses could be gathered.
1488 addr = socket.gethostbyname(rawHost)
1491 except socket.error:
1494 fqdn = socket.getfqdn(rawHost)
1497 except socket.error:
1499 # make a check value list from the registry entry: replace the
1500 # '<local>' string by the localhost entry and the corresponding
1502 proxyOverride = proxyOverride.split(';')
# Expand the '<local>' token in place into concrete local identities.
1504 while i < len(proxyOverride):
1505 if proxyOverride[i] == '<local>':
1506 proxyOverride[i:i+1] = ['localhost',
1508 socket.gethostname(),
1509 socket.gethostbyname(
1510 socket.gethostname())]
1512 # print proxyOverride
1513 # now check if we match one of the registry values.
# Translate each glob-style override entry into a regex and match it
# case-insensitively against the candidate host values.
1514 for test in proxyOverride:
1515 test = test.replace(".", r"\.") # mask dots
1516 test = test.replace("*", r".*") # change glob sequence
1517 test = test.replace("?", r".") # change glob char
1519 # print "%s <--> %s" %( test, val )
1520 if re.match(test, val, re.I):
# NOTE(review): the docstring below looks copy-pasted from getproxies() and
# misdescribes this function -- proxy_bypass(host) actually returns a truthy
# value when 'host' should be accessed directly (no proxy), as decided by the
# environment first and the registry otherwise.  Consider fixing upstream.
1524 def proxy_bypass(host):
1525 """Return a dictionary of scheme -> proxy server URL mappings.
1527 Returns settings gathered from the environment, if specified,
# Environment-based configuration, when present, takes precedence.
1531 if getproxies_environment():
1532 return proxy_bypass_environment(host)
1534 return proxy_bypass_registry(host)
# Fallback for platforms that are neither darwin nor nt: both proxy discovery
# and bypass decisions rely solely on environment variables.
# (NOTE(review): the 'else:' header of this branch is sampled out of the excerpt.)
1537 # By default use environment variables
1538 getproxies = getproxies_environment
1539 proxy_bypass = proxy_bypass_environment
# Self-test / demo harness (Python 2 print statements throughout).
# NOTE(review): heavily line-sampled -- the 'def test1', 'def test' and
# 'def main' headers, timing setup, argument loops and the body of the
# __main__ guard are not visible in this excerpt.
1541 # Test and time quote() and unquote()
# Build a 256-character string covering every byte value for round-tripping.
1544 for i in range(256): s = s + chr(i)
1555 print round(t1 - t0, 3), 'sec'
1558 def reporthook(blocknum, blocksize, totalsize):
1559 # Report during remote transfers
1560 print "Block number: %d, Block size: %d, Total size: %d" % (
1561 blocknum, blocksize, totalsize)
# Default URL list exercising the file, ftp and http openers.
1569 'file://localhost/etc/passwd',
1570 'ftp://ftp.gnu.org/pub/README',
1571 'http://www.python.org/index.html',
# https is only tested when the build has SSL support compiled in.
1573 if hasattr(URLopener, "open_https"):
1574 args.append('https://synergy.as.cmu.edu/~geek/')
1577 print '-'*10, url, '-'*10
1578 fn, h = urlretrieve(url, None, reporthook)
1582 for k in h.keys(): print k + ':', h[k]
# Strip carriage returns from retrieved data before printing.
1588 table = string.maketrans("", "")
1589 data = data.translate(table, "\r")
# Command-line entry point: -t runs the self-test, -h prints usage,
# any other arguments are fetched and printed.
1599 opts, args = getopt.getopt(sys.argv[1:], "th")
1600 except getopt.error, msg:
1602 print "Use -h for help"
1609 print "Usage: python urllib.py [-t] [url ...]"
1610 print "-t runs self-test;",
1611 print "otherwise, contents of urls are printed"
1619 print "Use -h for help"
1621 print urlopen(url).read(),
1623 # Run test program when run as a script
1624 if __name__ == '__main__':