Add the conditional node checking during node cleaning

pull/4/merge
Richard Harding 12 years ago
parent 14bbe701eb
commit 7d2eec8f52

@ -38,6 +38,16 @@ def drop_tag(doc, *tags):
return doc
def ok_embedded_video(node):
    """Check if this embed/video is an ok one to count.

    The node is serialized with ``tounicode`` and the resulting markup is
    searched for the names of the video hosts we want to keep.

    :param node: the embed/object element to inspect.
    :returns: True if any keep keyword occurs in the serialized node,
        otherwise False.
    """
    keep_keywords = ['youtube', 'blip.tv', 'vimeo']
    # Work on the ``node`` argument only: this helper must not depend on
    # loop locals of its callers (such as ``n`` or ``allow``), which are not
    # defined in this scope.
    node_str = tounicode(node)
    return any(key in node_str for key in keep_keywords)
def build_base_document(html):
"""Return a base document with the body as root.
@ -158,7 +168,6 @@ def prep_article(doc):
"""Clean up the final document we return as the readable article"""
LOG.debug('Cleaning document')
clean_list = ['object', 'h1']
keep_keywords = ['youtube', 'blip.tv', 'vimeo']
# If there is only one h2, they are probably using it as a header and
# not a subheader, so remove it since we already have a header.
@ -181,13 +190,9 @@ def prep_article(doc):
# Allow youtube and vimeo videos through as people usually
# want to see those.
if is_embed:
# if this object or embed has any of the keywords in the
# html from here on out, then let it live.
node_str = tounicode(n)
if ok_embedded_video(n):
allow = True
for key in keep_keywords:
if not allow and key in node_str:
allow = True
if not allow:
LOG.debug('Dropping node: ' + str(n))
n.drop_tree()
@ -207,7 +212,6 @@ def prep_article(doc):
# go on with next loop, this guy is gone
continue
# clean out extra <p>
if n.tag == 'p':
# if the p has no children and has no content...well then down
@ -217,10 +221,82 @@ def prep_article(doc):
n.drop_tree()
# go on with next loop, this guy is gone
continue
# finally try out the conditional cleaning of the target node
clean_conditionally(n)
return node
def clean_conditionally(doc, clean_el):
def clean_conditionally(node):
    """Drop ``node`` from its tree if it looks like bad content.

    Only ``form``, ``table``, ``ul``, ``div`` and ``p`` nodes are examined;
    any other tag is left alone. A node is dropped when its class weight
    plus content score is negative, or when it contains fewer than ten
    commas and trips one of the ratio heuristics below (images vs.
    paragraphs, list items, inputs, text length, link density, embeds).

    :param node: the element to evaluate; it is removed in place with
        ``drop_tree()`` when judged to be bad content.
    :returns: None.
    """
    target_tags = ['form', 'table', 'ul', 'div', 'p']

    if node.tag not in target_tags:
        # this is not the tag you're looking for
        return

    weight = get_class_weight(node)
    # content_score = LOOK up the content score for this node we found
    # before else default to 0
    content_score = 0

    if weight + content_score < 0:
        LOG.debug('Dropping conditional node: ' + str(node))
        # A node without a parent is not in a tree that can take dropping
        # it, so guard the same way the final drop below does.
        if node.getparent() is not None:
            node.drop_tree()
        # The node has been judged; running the remaining heuristics on a
        # detached node would only produce misleading log output.
        return

    # Compute the text once; it feeds both the comma count and the length.
    text = node.text_content()
    if text.count(',') >= 10:
        return

    LOG.debug("There aren't 10 ,s so we're processing more")

    # If there are not very many commas, and the number of
    # non-paragraph elements is more than paragraphs or other ominous
    # signs, remove the element.
    p = len(node.findall('.//p'))
    img = len(node.findall('.//img'))
    # NOTE(review): the -100 offset means the li rule only fires for nodes
    # with more than 100 list items beyond the paragraph count; presumably
    # inherited from the upstream readability heuristics -- confirm.
    li = len(node.findall('.//li')) - 100
    inputs = len(node.findall('.//input'))

    # Only embeds that pass the video whitelist are counted.
    embed = sum(1 for e in node.findall('.//embed') if ok_embedded_video(e))

    link_density = get_link_density(node)
    content_length = len(text)

    remove_node = False

    if img > p:
        # this one has shown to do some extra image removals.
        # we could get around this by checking for caption info in the
        # images to try to do some scoring of good v. bad images.
        # failing example:
        # arstechnica.com/science/news/2012/05/1859s-great-auroral-stormthe-week-the-sun-touched-the-earth.ars
        LOG.debug('Conditional drop: img > p')
        remove_node = True
    elif li > p and node.tag != 'ul' and node.tag != 'ol':
        LOG.debug('Conditional drop: li > p and not ul/ol')
        remove_node = True
    elif inputs > p / 3.0:
        LOG.debug('Conditional drop: inputs > p/3.0')
        remove_node = True
    elif content_length < 25 and (img == 0 or img > 2):
        LOG.debug('Conditional drop: len < 25 and 0/>2 images')
        remove_node = True
    elif weight < 25 and link_density > 0.2:
        LOG.debug('Conditional drop: weight small and link is dense')
        remove_node = True
    elif weight >= 25 and link_density > 0.5:
        LOG.debug('Conditional drop: weight big but link heavy')
        remove_node = True
    elif (embed == 1 and content_length < 75) or embed > 1:
        LOG.debug('Conditional drop: embed without much content or many embed')
        remove_node = True

    if remove_node:
        # For some reason the parent is none so we can't drop, we're
        # not in a tree that can take dropping this node.
        if node.getparent() is not None:
            node.drop_tree()
doc = clean_document(doc)
return doc

@ -0,0 +1,532 @@
<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01//EN" "http://www.w3.org/TR/html4/strict.dtd">
<html>
<head>
<meta http-equiv="Content-Type" content="text/html;charset=utf-8">
<meta name="robots" content="index,nofollow">
<title>PythonSpeed/PerformanceTips - PythonInfo Wiki</title>
<script type="text/javascript" src="/moin/moin_static193/common/js/common.js"></script>
<script type="text/javascript">
<!--
var search_hint = "Search";
//-->
</script>
<link rel="stylesheet" type="text/css" charset="utf-8" media="all" href="/moin/moin_static193/europython/css/common.css">
<link rel="stylesheet" type="text/css" charset="utf-8" media="screen" href="/moin/moin_static193/europython/css/screen.css">
<link rel="stylesheet" type="text/css" charset="utf-8" media="print" href="/moin/moin_static193/europython/css/print.css">
<link rel="stylesheet" type="text/css" charset="utf-8" media="projection" href="/moin/moin_static193/europython/css/projection.css">
<!-- css only for MS IE6/IE7 browsers -->
<!--[if lt IE 8]>
<link rel="stylesheet" type="text/css" charset="utf-8" media="all" href="/moin/moin_static193/europython/css/msie.css">
<![endif]-->
<link rel="Start" href="/moin/FrontPage">
<link rel="Alternate" title="Wiki Markup" href="/moin/PythonSpeed/PerformanceTips?action=raw">
<link rel="Alternate" media="print" title="Print View" href="/moin/PythonSpeed/PerformanceTips?action=print">
<link rel="Up" href="/moin/PythonSpeed">
<link rel="Search" href="/moin/FindPage">
<link rel="Index" href="/moin/TitleIndex">
<link rel="Glossary" href="/moin/WordIndex">
<link rel="Help" href="/moin/HelpOnFormatting">
</head>
<body lang="en" dir="ltr">
<div id="header">
<form id="searchform" method="get" action="/moin/PythonSpeed/PerformanceTips">
<div>
<input type="hidden" name="action" value="fullsearch">
<input type="hidden" name="context" value="180">
<label for="searchinput">Search:</label>
<input id="searchinput" type="text" name="value" value="" size="20"
onfocus="searchFocus(this)" onblur="searchBlur(this)"
onkeyup="searchChange(this)" onchange="searchChange(this)" alt="Search">
<input id="titlesearch" name="titlesearch" type="submit"
value="Titles" alt="Search Titles">
<input id="fullsearch" name="fullsearch" type="submit"
value="Text" alt="Search Full Text">
</div>
</form>
<script type="text/javascript">
<!--// Initialize search form
var f = document.getElementById('searchform');
f.getElementsByTagName('label')[0].style.display = 'none';
var e = document.getElementById('searchinput');
searchChange(e);
searchBlur(e);
//-->
</script>
<div id="logo"><a href="/moin/FrontPage"></a><a href="http://python.org"><img src="http://www.python.org/images/python-logo.gif" alt="Python" ></a><a name="logo"></a></div>
<div id="locationline">
<ul id="pagelocation">
<li><a href="/moin/PythonSpeed">PythonSpeed</a></li><li><a class="backlink" href="/moin/PythonSpeed/PerformanceTips?action=fullsearch&amp;context=180&amp;value=linkto%3A%22PythonSpeed%2FPerformanceTips%22" rel="nofollow" title="Click to do a full-text search for this title">PerformanceTips</a></li>
</ul>
</div>
<ul id="pagetrail">
<li><a href="/moin/PythonSpeed/PerformanceTips">PerformanceTips</a></li>
</ul>
</div>
<div id="sidebar">
<div id="star">
</div>
<div class="sidepanel">
<ul id="navibar">
<li class="wikilink"><a href="/moin/FrontPage">FrontPage</a></li><li class="wikilink"><a href="/moin/RecentChanges">RecentChanges</a></li><li class="wikilink"><a href="/moin/FindPage">FindPage</a></li><li class="wikilink"><a href="/moin/HelpContents">HelpContents</a></li><li class="current"><a href="/moin/PythonSpeed/PerformanceTips">PerformanceTips</a></li>
</ul>
</div>
<div class="sidepanel">
<h1>Page</h1>
<ul class="editbar"><li><span class="disabled">Immutable Page</span></li><li class="toggleCommentsButton" style="display:none;"><a href="#" class="nbcomment" onClick="toggleComments();return false;">Comments</a></li><li><a class="nbinfo" href="/moin/PythonSpeed/PerformanceTips?action=info" rel="nofollow">Info</a></li><li><a class="nbattachments" href="/moin/PythonSpeed/PerformanceTips?action=AttachFile" rel="nofollow">Attachments</a></li><li>
<form class="actionsmenu" method="GET" action="/moin/PythonSpeed/PerformanceTips">
<div>
<label>More Actions:</label>
<select name="action"
onchange="if ((this.selectedIndex != 0) &&
(this.options[this.selectedIndex].disabled == false)) {
this.form.submit();
}
this.selectedIndex = 0;">
<option value="raw">Raw Text</option>
<option value="print">Print View</option>
<option value="refresh">Delete Cache</option>
<option value="show" disabled class="disabled">------------------------</option>
<option value="SpellCheck">Check Spelling</option>
<option value="LikePages">Like Pages</option>
<option value="LocalSiteMap">Local Site Map</option>
<option value="show" disabled class="disabled">------------------------</option>
<option value="RenamePage" disabled class="disabled">Rename Page</option>
<option value="DeletePage" disabled class="disabled">Delete Page</option>
<option value="show" disabled class="disabled">------------------------</option>
<option value="show" disabled class="disabled">------------------------</option>
<option value="show" disabled class="disabled">Remove Spam</option>
<option value="show" disabled class="disabled">Revert to this revision</option>
<option value="show" disabled class="disabled">------------------------</option>
<option value="Load">Load</option>
<option value="SlideShow">SlideShow</option>
</select>
<input type="submit" value="Do">
</div>
<script type="text/javascript">
<!--// Init menu
actionsMenuInit('More Actions:');
//-->
</script>
</form>
</li></ul>
</div>
<div class="sidepanel">
<h1>User</h1>
<ul id="username"><li><a href="/moin/PythonSpeed/PerformanceTips?action=login" id="login" rel="nofollow">Login</a></li></ul>
</div>
</div>
<div id="page" lang="en" dir="ltr">
<div dir="ltr" id="content" lang="en"><span class="anchor" id="top"></span>
<span class="anchor" id="line-1"></span><span class="anchor" id="line-2"></span><span class="anchor" id="line-3"></span><p class="line867"><div class="table-of-contents"><p class="table-of-contents-heading">Contents<ol><li>
<a href="#Other_Versions">Other Versions</a></li><li>
<a href="#Overview:_Optimize_what_needs_optimizing">Overview: Optimize what needs optimizing</a></li><li>
<a href="#Choose_the_Right_Data_Structure">Choose the Right Data Structure</a></li><li>
<a href="#Sorting">Sorting</a></li><li>
<a href="#String_Concatenation">String Concatenation</a></li><li>
<a href="#Loops">Loops</a></li><li>
<a href="#Avoiding_dots...">Avoiding dots...</a></li><li>
<a href="#Local_Variables">Local Variables</a></li><li>
<a href="#Initializing_Dictionary_Elements">Initializing Dictionary Elements</a></li><li>
<a href="#Import_Statement_Overhead">Import Statement Overhead</a></li><li>
<a href="#Data_Aggregation">Data Aggregation</a></li><li>
<a href="#Doing_Stuff_Less_Often">Doing Stuff Less Often</a></li><li>
<a href="#Python_is_not_C">Python is not C</a></li><li>
<a href="#Use_xrange_instead_of_range">Use xrange instead of range</a></li><li>
<a href="#Re-map_Functions_at_runtime">Re-map Functions at runtime</a></li><li>
<a href="#Profiling_Code">Profiling Code</a><ol><li>
<a href="#Profiling">Profiling</a></li><li>
<a href="#The_cProfile_and_Hotshot_Modules">The cProfile and Hotshot Modules</a></li><li>
<a href="#Trace_Module">Trace Module</a></li><li>
<a href="#Visualizing_Profiling_Results">Visualizing Profiling Results</a></li></ol></li></ol></li></ol></div> <span class="anchor" id="line-4"></span><span class="anchor" id="line-5"></span><p class="line874">This page is devoted to various tips and tricks that help improve the <span class="anchor" id="line-6"></span>performance of your Python programs. Wherever the information comes from <span class="anchor" id="line-7"></span>someone else, I've tried to identify the source. <span class="anchor" id="line-8"></span><span class="anchor" id="line-9"></span><p class="line874">Python has changed in some significant ways since I first wrote my "fast <span class="anchor" id="line-10"></span>python" page in about 1996, which means that some of the orderings will have <span class="anchor" id="line-11"></span>changed. I migrated it to the Python wiki in hopes others will help maintain <span class="anchor" id="line-12"></span>it. <span class="anchor" id="line-13"></span><span class="anchor" id="line-14"></span><p class="line867"><span class="anchor" id="line-15"></span><span class="anchor" id="line-16"></span><span class="anchor" id="line-17"></span><span class="anchor" id="line-18"></span><span class="anchor" id="line-19"></span><div class="tip"><span class="anchor" id="line-1-1"></span><p class="line874">You should always test these tips with your application and the specific version of the Python <span class="anchor" id="line-2-1"></span><a href="/moin/PythonImplementations">implementation</a> you intend to use and not just blindly accept that one <span class="anchor" id="line-3-1"></span>method is faster than another. See the <a href="/moin/PythonSpeed/PerformanceTips#Profiling">profiling</a> section for more details. 
</div><span class="anchor" id="line-20"></span><span class="anchor" id="line-21"></span><p class="line874">Also new since this was originally written are packages like <span class="anchor" id="line-22"></span><a class="http" href="http://cython.org/">Cython</a>, <span class="anchor" id="line-23"></span><a class="http" href="http://www.cosc.canterbury.ac.nz/~greg/python/Pyrex/">Pyrex</a>, <span class="anchor" id="line-24"></span><a class="http" href="http://psyco.sourceforge.net/">Psyco</a>, <span class="anchor" id="line-25"></span><a class="http" href="http://www.scipy.org/Weave">Weave</a> and <span class="anchor" id="line-26"></span><a class="http" href="http://pyinline.sourceforge.net/">PyInline</a>, which can dramatically improve <span class="anchor" id="line-27"></span>your application's performance by making it easier to push <span class="anchor" id="line-28"></span>performance-critical code into C or machine language. <span class="anchor" id="line-29"></span><span class="anchor" id="line-30"></span><p class="line867">
<h2 id="Other_Versions">Other Versions</h2>
<span class="anchor" id="line-31"></span><span class="anchor" id="line-32"></span><ul><li><p class="line862">Russian: <a class="http" href="http://omsk.lug.ru/wacko/PythonHacking/PerfomanceTips">http://omsk.lug.ru/wacko/PythonHacking/PerfomanceTips</a> <span class="anchor" id="line-33"></span><span class="anchor" id="line-34"></span></li></ul><p class="line867">
<h2 id="Overview:_Optimize_what_needs_optimizing">Overview: Optimize what needs optimizing</h2>
<span class="anchor" id="line-35"></span><span class="anchor" id="line-36"></span><p class="line862">You can only know what makes your program slow after first getting the program to give correct results, then running it to see if the correct program is slow. <br>
<span class="anchor" id="line-37"></span>When found to be slow, profiling can show what parts of the program are consuming most of the time. A comprehensive but quick-to-run test suite can then ensure that future optimizations don't change the correctness of your program. <br>
In short: <span class="anchor" id="line-38"></span><ol type="1"><li>Get it right. <span class="anchor" id="line-39"></span></li><li>Test it's right. <span class="anchor" id="line-40"></span></li><li>Profile if slow. <span class="anchor" id="line-41"></span></li><li>Optimise. <span class="anchor" id="line-42"></span></li><li>Repeat from 2. <span class="anchor" id="line-43"></span><span class="anchor" id="line-44"></span></li></ol><p class="line874">Certain optimizations amount to good programming style and so should be learned as you learn the language. An example would be moving the calculation of values that don't change within a loop, outside of the loop. <span class="anchor" id="line-45"></span><span class="anchor" id="line-46"></span><p class="line867">
<h2 id="Choose_the_Right_Data_Structure">Choose the Right Data Structure</h2>
<span class="anchor" id="line-47"></span><span class="anchor" id="line-48"></span><p class="line874">TBD. <span class="anchor" id="line-49"></span><span class="anchor" id="line-50"></span><span class="anchor" id="line-51"></span><p class="line867">
<h2 id="Sorting">Sorting</h2>
<span class="anchor" id="line-52"></span><span class="anchor" id="line-53"></span><p class="line874">Sorting lists of basic Python objects is generally pretty efficient. The sort method for lists takes an optional comparison function as an <span class="anchor" id="line-54"></span>argument that can be used to change the sorting behavior. This is quite convenient, though it can significantly slow down your sorts, as the comparison function will be called many times. In Python 2.4, you should use the key argument to the built-in sort instead, which should be the fastest way to sort. <span class="anchor" id="line-55"></span><span class="anchor" id="line-56"></span><p class="line874">Only if you are using older versions of Python (before 2.4) does the following advice from Guido van Rossum apply: <span class="anchor" id="line-57"></span><span class="anchor" id="line-58"></span><p class="line874">An alternative way to speed up sorts is to construct a list of tuples whose <span class="anchor" id="line-59"></span>first element is a sort key that will sort properly using the default <span class="anchor" id="line-60"></span>comparison, and whose second element is the original list element. This is <span class="anchor" id="line-61"></span>the so-called <span class="anchor" id="line-62"></span><a class="http" href="http://www.google.com/search?q=Schwartzian+Transform">Schwartzian Transform</a>, <span class="anchor" id="line-63"></span>also known as <a class="nonexistent" href="/moin/DecorateSortUndecorate">DecorateSortUndecorate</a> (DSU). <span class="anchor" id="line-64"></span><span class="anchor" id="line-65"></span><p class="line874">Suppose, for example, you have a list of tuples that you want to sort by <span class="anchor" id="line-66"></span>the n-th field of each tuple. The following function will do that. 
<span class="anchor" id="line-67"></span><span class="anchor" id="line-68"></span><p class="line867"><span class="anchor" id="line-69"></span><span class="anchor" id="line-70"></span><span class="anchor" id="line-71"></span><span class="anchor" id="line-72"></span><span class="anchor" id="line-73"></span><span class="anchor" id="line-1-2"></span><div class="highlight python"><div class="codearea" dir="ltr" lang="en"><pre dir="ltr" id="CA-4c95aa1197e11374ef322d77445b2f2c7e577c5c" lang="en"><span class="line"><span class="anchor" id="line-1-3"></span><span class="ResWord">def</span> <span class="ID">sortby</span>(<span class="ID">somelist</span>, <span class="ID">n</span>):</span>
<span class="line"><span class="anchor" id="line-2-2"></span> <span class="ID">nlist</span> = [(<span class="ID">x</span>[<span class="ID">n</span>], <span class="ID">x</span>) <span class="ResWord">for</span> <span class="ID">x</span> <span class="ResWord">in</span> <span class="ID">somelist</span>]</span>
<span class="line"><span class="anchor" id="line-3-2"></span> <span class="ID">nlist</span>.<span class="ID">sort</span>()</span>
<span class="line"><span class="anchor" id="line-4-1"></span> <span class="ResWord">return</span> [<span class="ID">val</span> <span class="ResWord">for</span> (<span class="ID">key</span>, <span class="ID">val</span>) <span class="ResWord">in</span> <span class="ID">nlist</span>]</span>
</pre></div></div><span class="anchor" id="line-74"></span><span class="anchor" id="line-75"></span><p class="line874">Matching the behavior of the current list sort method (sorting in place) <span class="anchor" id="line-76"></span>is easily achieved as well: <span class="anchor" id="line-77"></span><span class="anchor" id="line-78"></span><p class="line867"><span class="anchor" id="line-79"></span><span class="anchor" id="line-80"></span><span class="anchor" id="line-81"></span><span class="anchor" id="line-82"></span><span class="anchor" id="line-83"></span><span class="anchor" id="line-84"></span><span class="anchor" id="line-1-4"></span><div class="highlight python"><div class="codearea" dir="ltr" lang="en"><pre dir="ltr" id="CA-b5d438b0fc57a28d7487119740102f5f864d3f47" lang="en"><span class="line"><span class="anchor" id="line-1-5"></span><span class="ResWord">def</span> <span class="ID">sortby_inplace</span>(<span class="ID">somelist</span>, <span class="ID">n</span>):</span>
<span class="line"><span class="anchor" id="line-2-3"></span> <span class="ID">somelist</span>[:] = [(<span class="ID">x</span>[<span class="ID">n</span>], <span class="ID">x</span>) <span class="ResWord">for</span> <span class="ID">x</span> <span class="ResWord">in</span> <span class="ID">somelist</span>]</span>
<span class="line"><span class="anchor" id="line-3-3"></span> <span class="ID">somelist</span>.<span class="ID">sort</span>()</span>
<span class="line"><span class="anchor" id="line-4-2"></span> <span class="ID">somelist</span>[:] = [<span class="ID">val</span> <span class="ResWord">for</span> (<span class="ID">key</span>, <span class="ID">val</span>) <span class="ResWord">in</span> <span class="ID">somelist</span>]</span>
<span class="line"><span class="anchor" id="line-5-1"></span> <span class="ResWord">return</span></span>
</pre></div></div><span class="anchor" id="line-85"></span><span class="anchor" id="line-86"></span><p class="line874">Here's an example use: <span class="anchor" id="line-87"></span><span class="anchor" id="line-88"></span><p class="line867"><span class="anchor" id="line-89"></span><span class="anchor" id="line-90"></span><span class="anchor" id="line-91"></span><span class="anchor" id="line-92"></span><span class="anchor" id="line-93"></span><span class="anchor" id="line-94"></span><span class="anchor" id="line-95"></span><span class="anchor" id="line-96"></span><span class="anchor" id="line-97"></span><span class="anchor" id="line-98"></span><span class="anchor" id="line-99"></span><span class="anchor" id="line-100"></span><span class="anchor" id="line-101"></span><span class="anchor" id="line-1-6"></span><div class="highlight python"><div class="codearea" dir="ltr" lang="en"><pre dir="ltr" id="CA-9beb922c6d0be91b76fb9d6d96aec045dffc5787" lang="en"><span class="line"><span class="anchor" id="line-1-7"></span>&gt;&gt;&gt; <span class="ID">somelist</span> = [(<span class="Number">1</span>, <span class="Number">2</span>, <span class="String">'</span><span class="String">def</span><span class="String">'</span>), (<span class="Number">2</span>, -<span class="Number">4</span>, <span class="String">'</span><span class="String">ghi</span><span class="String">'</span>), (<span class="Number">3</span>, <span class="Number">6</span>, <span class="String">'</span><span class="String">abc</span><span class="String">'</span>)]</span>
<span class="line"><span class="anchor" id="line-2-4"></span>&gt;&gt;&gt; <span class="ID">somelist</span>.<span class="ID">sort</span>()</span>
<span class="line"><span class="anchor" id="line-3-4"></span>&gt;&gt;&gt; <span class="ID">somelist</span></span>
<span class="line"><span class="anchor" id="line-4-3"></span>[(<span class="Number">1</span>, <span class="Number">2</span>, <span class="String">'</span><span class="String">def</span><span class="String">'</span>), (<span class="Number">2</span>, -<span class="Number">4</span>, <span class="String">'</span><span class="String">ghi</span><span class="String">'</span>), (<span class="Number">3</span>, <span class="Number">6</span>, <span class="String">'</span><span class="String">abc</span><span class="String">'</span>)]</span>
<span class="line"><span class="anchor" id="line-5-2"></span>&gt;&gt;&gt; <span class="ID">nlist</span> = <span class="ID">sortby</span>(<span class="ID">somelist</span>, <span class="Number">2</span>)</span>
<span class="line"><span class="anchor" id="line-6-1"></span>&gt;&gt;&gt; <span class="ID">sortby_inplace</span>(<span class="ID">somelist</span>, <span class="Number">2</span>)</span>
<span class="line"><span class="anchor" id="line-7-1"></span>&gt;&gt;&gt; <span class="ID">nlist</span> == <span class="ID">somelist</span></span>
<span class="line"><span class="anchor" id="line-8-1"></span><span class="ResWord">True</span></span>
<span class="line"><span class="anchor" id="line-9-1"></span>&gt;&gt;&gt; <span class="ID">nlist</span> = <span class="ID">sortby</span>(<span class="ID">somelist</span>, <span class="Number">1</span>)</span>
<span class="line"><span class="anchor" id="line-10-1"></span>&gt;&gt;&gt; <span class="ID">sortby_inplace</span>(<span class="ID">somelist</span>, <span class="Number">1</span>)</span>
<span class="line"><span class="anchor" id="line-11-1"></span>&gt;&gt;&gt; <span class="ID">nlist</span> == <span class="ID">somelist</span></span>
<span class="line"><span class="anchor" id="line-12-1"></span><span class="ResWord">True</span></span>
</pre></div></div><span class="anchor" id="line-102"></span><span class="anchor" id="line-103"></span><p class="line874">From Tim Delaney <span class="anchor" id="line-104"></span><span class="anchor" id="line-105"></span><p class="line874">From Python 2.3 sort is guaranteed to be stable. <span class="anchor" id="line-106"></span><span class="anchor" id="line-107"></span><p class="line874">(to be precise, it's stable in CPython 2.3, and guaranteed to be stable in Python 2.4) <span class="anchor" id="line-108"></span><span class="anchor" id="line-109"></span><p class="line874">Python 2.4 adds an optional key parameter which makes the transform a lot easier to use: <span class="anchor" id="line-110"></span><span class="anchor" id="line-111"></span><p class="line867"><span class="anchor" id="line-112"></span><span class="anchor" id="line-113"></span><span class="anchor" id="line-114"></span><span class="anchor" id="line-115"></span><span class="anchor" id="line-116"></span><span class="anchor" id="line-117"></span><span class="anchor" id="line-118"></span><span class="anchor" id="line-1-8"></span><div class="highlight python"><div class="codearea" dir="ltr" lang="en"><pre dir="ltr" id="CA-53254b613e21c50532843c9f83967228f3dc07f7" lang="en"><span class="line"><span class="anchor" id="line-1-9"></span><span class="Comment"># E.g. n = 1</span></span>
<span class="line"><span class="anchor" id="line-2-5"></span><span class="ID">n</span> = <span class="Number">1</span></span>
<span class="line"><span class="anchor" id="line-3-5"></span><span class="ResWord">import</span> <span class="ID">operator</span></span>
<span class="line"><span class="anchor" id="line-4-4"></span><span class="ID">nlist</span>.<span class="ID">sort</span>(<span class="ID">key</span>=<span class="ID">operator</span>.<span class="ID">itemgetter</span>(<span class="ID">n</span>))</span>
<span class="line"><span class="anchor" id="line-5-3"></span><span class="Comment"># use sorted() if you don't want to sort in-place:</span></span>
<span class="line"><span class="anchor" id="line-6-2"></span><span class="Comment"># sortedlist = sorted(nlist, key=operator.itemgetter(n))</span></span>
</pre></div></div><span class="anchor" id="line-119"></span><span class="anchor" id="line-120"></span><p class="line874">Note that the original item is never used for sorting, only the returned key - this is equivalent to doing: <span class="anchor" id="line-121"></span><span class="anchor" id="line-122"></span><p class="line867"><span class="anchor" id="line-123"></span><span class="anchor" id="line-124"></span><span class="anchor" id="line-125"></span><span class="anchor" id="line-126"></span><span class="anchor" id="line-127"></span><span class="anchor" id="line-128"></span><span class="anchor" id="line-1-10"></span><div class="highlight python"><div class="codearea" dir="ltr" lang="en"><pre dir="ltr" id="CA-65ad7cc5b542286e1cfed57d83149b46764e0849" lang="en"><span class="line"><span class="anchor" id="line-1-11"></span><span class="Comment"># E.g. n = 1</span></span>
<span class="line"><span class="anchor" id="line-2-6"></span><span class="ID">n</span> = <span class="Number">1</span></span>
<span class="line"><span class="anchor" id="line-3-6"></span><span class="ID">nlist</span> = [(<span class="ID">x</span>[<span class="ID">n</span>], <span class="ID">i</span>, <span class="ID">x</span>) <span class="ResWord">for</span> (<span class="ID">i</span>, <span class="ID">x</span>) <span class="ResWord">in</span> <span class="ResWord">enumerate</span>(<span class="ID">nlist</span>)]</span>
<span class="line"><span class="anchor" id="line-4-5"></span><span class="ID">nlist</span>.<span class="ID">sort</span>()</span>
<span class="line"><span class="anchor" id="line-5-4"></span><span class="ID">nlist</span> = [<span class="ID">val</span> <span class="ResWord">for</span> (<span class="ID">key</span>, <span class="ID">index</span>, <span class="ID">val</span>) <span class="ResWord">in</span> <span class="ID">nlist</span>]</span>
</pre></div></div><span class="anchor" id="line-129"></span><span class="anchor" id="line-130"></span><span class="anchor" id="line-131"></span><p class="line867">
<h2 id="String_Concatenation">String Concatenation</h2>
<span class="anchor" id="line-132"></span><span class="anchor" id="line-133"></span><p class="line867"><span class="anchor" id="line-134"></span><span class="anchor" id="line-135"></span><span class="anchor" id="line-136"></span><div class="note"><span class="anchor" id="line-1-12"></span><p class="line874">The accuracy of this section is disputed with respect to later versions of Python. In CPython 2.5, string concatenation is fairly fast, although this may not apply likewise to other Python implementations. <span class="anchor" id="line-2-7"></span>See <a href="/moin/ConcatenationTestCode">ConcatenationTestCode</a> for a discussion. </div><span class="anchor" id="line-137"></span><span class="anchor" id="line-138"></span><p class="line874">Strings in Python are immutable. This fact frequently sneaks up and <span class="anchor" id="line-139"></span>bites novice Python programmers on the rump. Immutability confers some <span class="anchor" id="line-140"></span>advantages and disadvantages. In the plus column, strings can be used as <span class="anchor" id="line-141"></span>keys in dictionaries and individual copies can be shared among multiple <span class="anchor" id="line-142"></span>variable bindings. (Python automatically shares one- and two-character <span class="anchor" id="line-143"></span>strings.) In the minus column, you can't say something like, "change all <span class="anchor" id="line-144"></span>the 'a's to 'b's" in any given string. Instead, you have to create a new <span class="anchor" id="line-145"></span>string with the desired properties. This continual copying can lead to <span class="anchor" id="line-146"></span>significant inefficiencies in Python programs. 
<span class="anchor" id="line-147"></span><span class="anchor" id="line-148"></span><p class="line874">Avoid this: <span class="anchor" id="line-149"></span><span class="anchor" id="line-150"></span><p class="line867"><span class="anchor" id="line-151"></span><span class="anchor" id="line-152"></span><span class="anchor" id="line-153"></span><span class="anchor" id="line-154"></span><span class="anchor" id="line-1-13"></span><div class="highlight python"><div class="codearea" dir="ltr" lang="en"><pre dir="ltr" id="CA-853541f673f564baa7b8be8460a460586cede84b" lang="en"><span class="line"><span class="anchor" id="line-1-14"></span><span class="ID">s</span> = <span class="String">"</span><span class="String">"</span></span>
<span class="line"><span class="anchor" id="line-2-8"></span><span class="ResWord">for</span> <span class="ID">substring</span> <span class="ResWord">in</span> <span class="ResWord">list</span>:</span>
<span class="line"><span class="anchor" id="line-3-7"></span> <span class="ID">s</span> += <span class="ID">substring</span></span>
</pre></div></div><span class="anchor" id="line-155"></span><span class="anchor" id="line-156"></span><p class="line862">Use <tt>s&nbsp;=&nbsp;"".join(list)</tt> instead. The former is a very common and <span class="anchor" id="line-157"></span>catastrophic mistake when building large strings. Similarly, if you are <span class="anchor" id="line-158"></span>generating bits of a string sequentially instead of: <span class="anchor" id="line-159"></span><span class="anchor" id="line-160"></span><p class="line867"><span class="anchor" id="line-161"></span><span class="anchor" id="line-162"></span><span class="anchor" id="line-163"></span><span class="anchor" id="line-164"></span><span class="anchor" id="line-1-15"></span><div class="highlight python"><div class="codearea" dir="ltr" lang="en"><pre dir="ltr" id="CA-7b888acbf50bef6c770ceb1337934c97c654da8b" lang="en"><span class="line"><span class="anchor" id="line-1-16"></span><span class="ID">s</span> = <span class="String">"</span><span class="String">"</span></span>
<span class="line"><span class="anchor" id="line-2-9"></span><span class="ResWord">for</span> <span class="ID">x</span> <span class="ResWord">in</span> <span class="ResWord">list</span>:</span>
<span class="line"><span class="anchor" id="line-3-8"></span> <span class="ID">s</span> += <span class="ID">some_function</span>(<span class="ID">x</span>)</span>
</pre></div></div><span class="anchor" id="line-165"></span><span class="anchor" id="line-166"></span><p class="line874">use <span class="anchor" id="line-167"></span><span class="anchor" id="line-168"></span><p class="line867"><span class="anchor" id="line-169"></span><span class="anchor" id="line-170"></span><span class="anchor" id="line-171"></span><span class="anchor" id="line-1-17"></span><div class="highlight python"><div class="codearea" dir="ltr" lang="en"><pre dir="ltr" id="CA-93fd2cbb1452d458a946f9fc4cd7937b7406fd95" lang="en"><span class="line"><span class="anchor" id="line-1-18"></span><span class="ID">slist</span> = [<span class="ID">some_function</span>(<span class="ID">elt</span>) <span class="ResWord">for</span> <span class="ID">elt</span> <span class="ResWord">in</span> <span class="ID">somelist</span>]</span>
<span class="line"><span class="anchor" id="line-2-10"></span><span class="ID">s</span> = <span class="String">"</span><span class="String">"</span>.<span class="ID">join</span>(<span class="ID">slist</span>)</span>
</pre></div></div><span class="anchor" id="line-172"></span><span class="anchor" id="line-173"></span><p class="line874">Avoid: <span class="anchor" id="line-174"></span><span class="anchor" id="line-175"></span><p class="line867"><span class="anchor" id="line-176"></span><span class="anchor" id="line-177"></span><span class="anchor" id="line-1-19"></span><div class="highlight python"><div class="codearea" dir="ltr" lang="en"><pre dir="ltr" id="CA-413eefcbf1ed10ef3861ff0e3f219f28a4611705" lang="en"><span class="line"><span class="anchor" id="line-1-20"></span><span class="ID">out</span> = <span class="String">"</span><span class="String">&lt;html&gt;</span><span class="String">"</span> + <span class="ID">head</span> + <span class="ID">prologue</span> + <span class="ID">query</span> + <span class="ID">tail</span> + <span class="String">"</span><span class="String">&lt;/html&gt;</span><span class="String">"</span></span>
</pre></div></div><span class="anchor" id="line-178"></span><span class="anchor" id="line-179"></span><p class="line874">Instead, use <span class="anchor" id="line-180"></span><span class="anchor" id="line-181"></span><p class="line867"><span class="anchor" id="line-182"></span><span class="anchor" id="line-183"></span><span class="anchor" id="line-1-21"></span><div class="highlight python"><div class="codearea" dir="ltr" lang="en"><pre dir="ltr" id="CA-f70787930aa0cafc70b1b8d8a60e8e6c3c82e7ec" lang="en"><span class="line"><span class="anchor" id="line-1-22"></span><span class="ID">out</span> = <span class="String">"</span><span class="String">&lt;html&gt;</span><span class="String">%s</span><span class="String">%s</span><span class="String">%s</span><span class="String">%s</span><span class="String">&lt;/html&gt;</span><span class="String">"</span> % (<span class="ID">head</span>, <span class="ID">prologue</span>, <span class="ID">query</span>, <span class="ID">tail</span>)</span>
</pre></div></div><span class="anchor" id="line-184"></span><span class="anchor" id="line-185"></span><p class="line874">Even better, for readability (this has nothing to do with efficiency <span class="anchor" id="line-186"></span>other than yours as a programmer), use dictionary substitution: <span class="anchor" id="line-187"></span><span class="anchor" id="line-188"></span><p class="line867"><span class="anchor" id="line-189"></span><span class="anchor" id="line-190"></span><span class="anchor" id="line-1-23"></span><div class="highlight python"><div class="codearea" dir="ltr" lang="en"><pre dir="ltr" id="CA-ada9ec517b990116f43b207b452d7f924d81c609" lang="en"><span class="line"><span class="anchor" id="line-1-24"></span><span class="ID">out</span> = <span class="String">"</span><span class="String">&lt;html&gt;</span><span class="String">%(head)s</span><span class="String">%(prologue)s</span><span class="String">%(query)s</span><span class="String">%(tail)s</span><span class="String">&lt;/html&gt;</span><span class="String">"</span> % <span class="ResWord">locals</span>()</span>
</pre></div></div><span class="anchor" id="line-191"></span><span class="anchor" id="line-192"></span><p class="line874">These last two are going to be much faster, especially when piled up over <span class="anchor" id="line-193"></span>many CGI script executions, and easier to modify to boot. In addition, <span class="anchor" id="line-194"></span>the slow way of doing things got slower in Python 2.0 with the addition <span class="anchor" id="line-195"></span>of rich comparisons to the language. It now takes the Python virtual <span class="anchor" id="line-196"></span>machine a lot longer to figure out how to concatenate two strings. <span class="anchor" id="line-197"></span>(Don't forget that Python does all method lookup at runtime.) <span class="anchor" id="line-198"></span><span class="anchor" id="line-199"></span><span class="anchor" id="line-200"></span><p class="line867">
<h2 id="Loops">Loops</h2>
<span class="anchor" id="line-201"></span><span class="anchor" id="line-202"></span><p class="line862">Python supports a couple of looping constructs. The <tt>for</tt> statement is <span class="anchor" id="line-203"></span>most commonly used. It loops over the elements of a sequence, assigning each <span class="anchor" id="line-204"></span>to the loop variable. If the body of your loop is simple, the interpreter <span class="anchor" id="line-205"></span>overhead of the <tt>for</tt> loop itself can be a substantial amount of the <span class="anchor" id="line-206"></span>overhead. This is where the <span class="anchor" id="line-207"></span><a class="http" href="http://www.python.org/doc/lib/built-in-funcs.html">map</a> function is handy. <span class="anchor" id="line-208"></span>You can think of <tt>map</tt> as a <tt>for</tt> moved into C code. The only <span class="anchor" id="line-209"></span>restriction is that the "loop body" of <tt>map</tt> must be a function call. Besides the syntactic benefit of list comprehensions, they are often as fast or faster than equivalent use of <tt>map</tt>. <span class="anchor" id="line-210"></span><span class="anchor" id="line-211"></span><p class="line874">Here's a straightforward example. Instead of looping over a list of <span class="anchor" id="line-212"></span>words and converting them to upper case: <span class="anchor" id="line-213"></span><span class="anchor" id="line-214"></span><p class="line867"><span class="anchor" id="line-215"></span><span class="anchor" id="line-216"></span><span class="anchor" id="line-217"></span><span class="anchor" id="line-218"></span><span class="anchor" id="line-1-25"></span><div class="highlight python"><div class="codearea" dir="ltr" lang="en"><pre dir="ltr" id="CA-b46dcb4de03cd808434a751c2897204eef9e7875" lang="en"><span class="line"><span class="anchor" id="line-1-26"></span><span class="ID">newlist</span> = []</span>
<span class="line"><span class="anchor" id="line-2-11"></span><span class="ResWord">for</span> <span class="ID">word</span> <span class="ResWord">in</span> <span class="ID">oldlist</span>:</span>
<span class="line"><span class="anchor" id="line-3-9"></span> <span class="ID">newlist</span>.<span class="ID">append</span>(<span class="ID">word</span>.<span class="ID">upper</span>())</span>
</pre></div></div><span class="anchor" id="line-219"></span><span class="anchor" id="line-220"></span><p class="line862">you can use <tt>map</tt> to push the loop from the interpreter into compiled C <span class="anchor" id="line-221"></span>code: <span class="anchor" id="line-222"></span><span class="anchor" id="line-223"></span><p class="line867"><span class="anchor" id="line-224"></span><span class="anchor" id="line-225"></span><span class="anchor" id="line-1-27"></span><div class="highlight python"><div class="codearea" dir="ltr" lang="en"><pre dir="ltr" id="CA-577b8b589bf0805f03d2e5e5a1ab66c87163a56a" lang="en"><span class="line"><span class="anchor" id="line-1-28"></span><span class="ID">newlist</span> = <span class="ResWord">map</span>(<span class="ResWord">str</span>.<span class="ID">upper</span>, <span class="ID">oldlist</span>)</span>
</pre></div></div><span class="anchor" id="line-226"></span><span class="anchor" id="line-227"></span><p class="line874">List comprehensions were added to Python in version 2.0 as well. They <span class="anchor" id="line-228"></span>provide a syntactically more compact and more efficient way of writing the above for loop: <span class="anchor" id="line-229"></span><span class="anchor" id="line-230"></span><p class="line867"><span class="anchor" id="line-231"></span><span class="anchor" id="line-232"></span><span class="anchor" id="line-1-29"></span><div class="highlight python"><div class="codearea" dir="ltr" lang="en"><pre dir="ltr" id="CA-efd0e2d8af2481d190ee1e0e6f2767c2b6f1c5fd" lang="en"><span class="line"><span class="anchor" id="line-1-30"></span><span class="ID">newlist</span> = [<span class="ID">s</span>.<span class="ID">upper</span>() <span class="ResWord">for</span> <span class="ID">s</span> <span class="ResWord">in</span> <span class="ID">oldlist</span>]</span>
</pre></div></div><span class="anchor" id="line-233"></span><span class="anchor" id="line-234"></span><p class="line874">Generator expressions were added to Python in version 2.4. They function <span class="anchor" id="line-235"></span>more-or-less like list comprehensions or <tt>map</tt> but avoid the overhead of <span class="anchor" id="line-236"></span>generating the entire list at once. Instead, they return a generator object <span class="anchor" id="line-237"></span>which can be iterated over bit-by-bit: <span class="anchor" id="line-238"></span><span class="anchor" id="line-239"></span><p class="line867"><span class="anchor" id="line-240"></span><span class="anchor" id="line-241"></span><span class="anchor" id="line-1-31"></span><div class="highlight python"><div class="codearea" dir="ltr" lang="en"><pre dir="ltr" id="CA-f7deb6aaca1a007f34a0493df72e24a8615feaf7" lang="en"><span class="line"><span class="anchor" id="line-1-32"></span><span class="ID">iterator</span> = (<span class="ID">s</span>.<span class="ID">upper</span>() <span class="ResWord">for</span> <span class="ID">s</span> <span class="ResWord">in</span> <span class="ID">oldlist</span>)</span>
</pre></div></div><span class="anchor" id="line-242"></span><span class="anchor" id="line-243"></span><p class="line874">Which method is appropriate will depend on what version of Python you're <span class="anchor" id="line-244"></span>using and the characteristics of the data you are manipulating. <span class="anchor" id="line-245"></span><span class="anchor" id="line-246"></span><p class="line862">Guido van Rossum wrote a much more detailed (and succinct) examination of <a class="http" href="http://www.python.org/doc/essays/list2str.html">loop optimization</a> that is <span class="anchor" id="line-247"></span>definitely worth reading. <span class="anchor" id="line-248"></span><span class="anchor" id="line-249"></span><span class="anchor" id="line-250"></span><p class="line867">
<h2 id="Avoiding_dots...">Avoiding dots...</h2>
<span class="anchor" id="line-251"></span><span class="anchor" id="line-252"></span><p class="line862">Suppose you can't use <tt>map</tt> or a list comprehension? You may be stuck <span class="anchor" id="line-253"></span>with the for loop. The for loop example has another inefficiency. Both <span class="anchor" id="line-254"></span><tt>newlist.append</tt> and <tt>word.upper</tt> are function references that are <span class="anchor" id="line-255"></span>reevaluated each time through the loop. The original loop can be <span class="anchor" id="line-256"></span>replaced with: <span class="anchor" id="line-257"></span><span class="anchor" id="line-258"></span><p class="line867"><span class="anchor" id="line-259"></span><span class="anchor" id="line-260"></span><span class="anchor" id="line-261"></span><span class="anchor" id="line-262"></span><span class="anchor" id="line-263"></span><span class="anchor" id="line-264"></span><span class="anchor" id="line-1-33"></span><div class="highlight python"><div class="codearea" dir="ltr" lang="en"><pre dir="ltr" id="CA-4b1f09cada6b76847697465d77fd0decaa8cd8aa" lang="en"><span class="line"><span class="anchor" id="line-1-34"></span><span class="ID">upper</span> = <span class="ResWord">str</span>.<span class="ID">upper</span></span>
<span class="line"><span class="anchor" id="line-2-12"></span><span class="ID">newlist</span> = []</span>
<span class="line"><span class="anchor" id="line-3-10"></span><span class="ID">append</span> = <span class="ID">newlist</span>.<span class="ID">append</span></span>
<span class="line"><span class="anchor" id="line-4-6"></span><span class="ResWord">for</span> <span class="ID">word</span> <span class="ResWord">in</span> <span class="ID">oldlist</span>:</span>
<span class="line"><span class="anchor" id="line-5-5"></span> <span class="ID">append</span>(<span class="ID">upper</span>(<span class="ID">word</span>))</span>
</pre></div></div><span class="anchor" id="line-265"></span><span class="anchor" id="line-266"></span><p class="line874">This technique should be used with caution. It gets more difficult to <span class="anchor" id="line-267"></span>maintain if the loop is large. Unless you are intimately familiar with <span class="anchor" id="line-268"></span>that piece of code you will find yourself scanning up to check the <span class="anchor" id="line-269"></span>definitions of <tt>append</tt> and <tt>upper</tt>. <span class="anchor" id="line-270"></span><span class="anchor" id="line-271"></span><span class="anchor" id="line-272"></span><p class="line867">
<h2 id="Local_Variables">Local Variables</h2>
<span class="anchor" id="line-273"></span><span class="anchor" id="line-274"></span><p class="line862">The final speedup available to us for the non-<tt>map</tt> version of the <tt>for</tt> <span class="anchor" id="line-275"></span>loop is to use local variables wherever possible. If the above loop is <span class="anchor" id="line-276"></span>cast as a function, <tt>append</tt> and <tt>upper</tt> become local variables. Python <span class="anchor" id="line-277"></span>accesses local variables much more efficiently than global variables. <span class="anchor" id="line-278"></span><span class="anchor" id="line-279"></span><p class="line867"><span class="anchor" id="line-280"></span><span class="anchor" id="line-281"></span><span class="anchor" id="line-282"></span><span class="anchor" id="line-283"></span><span class="anchor" id="line-284"></span><span class="anchor" id="line-285"></span><span class="anchor" id="line-286"></span><span class="anchor" id="line-287"></span><span class="anchor" id="line-1-35"></span><div class="highlight python"><div class="codearea" dir="ltr" lang="en"><pre dir="ltr" id="CA-ba58521d292078c1bb1a23f62b3938bd529a7036" lang="en"><span class="line"><span class="anchor" id="line-1-36"></span><span class="ResWord">def</span> <span class="ID">func</span>():</span>
<span class="line"><span class="anchor" id="line-2-13"></span> <span class="ID">upper</span> = <span class="ResWord">str</span>.<span class="ID">upper</span></span>
<span class="line"><span class="anchor" id="line-3-11"></span> <span class="ID">newlist</span> = []</span>
<span class="line"><span class="anchor" id="line-4-7"></span> <span class="ID">append</span> = <span class="ID">newlist</span>.<span class="ID">append</span></span>
<span class="line"><span class="anchor" id="line-5-6"></span> <span class="ResWord">for</span> <span class="ID">word</span> <span class="ResWord">in</span> <span class="ID">oldlist</span>:</span>
<span class="line"><span class="anchor" id="line-6-3"></span> <span class="ID">append</span>(<span class="ID">upper</span>(<span class="ID">word</span>))</span>
<span class="line"><span class="anchor" id="line-7-2"></span> <span class="ResWord">return</span> <span class="ID">newlist</span></span>
</pre></div></div><span class="anchor" id="line-288"></span><span class="anchor" id="line-289"></span><p class="line874">At the time I originally wrote this I was using a 100MHz Pentium running <span class="anchor" id="line-290"></span>BSDI. I got the following times for converting the list of words in <span class="anchor" id="line-291"></span><tt>/usr/share/dict/words</tt> (38,470 words at that time) to upper case: <span class="anchor" id="line-292"></span><span class="anchor" id="line-293"></span><p class="line867"><span class="anchor" id="line-294"></span><span class="anchor" id="line-295"></span><span class="anchor" id="line-296"></span><span class="anchor" id="line-297"></span><span class="anchor" id="line-298"></span><span class="anchor" id="line-299"></span><pre><span class="anchor" id="line-1"></span>Version Time (seconds)
<span class="anchor" id="line-2"></span>Basic loop 3.47
<span class="anchor" id="line-3"></span>Eliminate dots 2.45
<span class="anchor" id="line-4"></span>Local variable &amp; no dots 1.79
<span class="anchor" id="line-5"></span>Using map function 0.54</pre><span class="anchor" id="line-300"></span><span class="anchor" id="line-301"></span><span class="anchor" id="line-302"></span><p class="line867">
<h2 id="Initializing_Dictionary_Elements">Initializing Dictionary Elements</h2>
<span class="anchor" id="line-303"></span><span class="anchor" id="line-304"></span><p class="line874">Suppose you are building a dictionary of word frequencies and you've <span class="anchor" id="line-305"></span>already broken your text up into a list of words. You might execute <span class="anchor" id="line-306"></span>something like: <span class="anchor" id="line-307"></span><span class="anchor" id="line-308"></span><p class="line867"><span class="anchor" id="line-309"></span><span class="anchor" id="line-310"></span><span class="anchor" id="line-311"></span><span class="anchor" id="line-312"></span><span class="anchor" id="line-313"></span><span class="anchor" id="line-314"></span><span class="anchor" id="line-1-37"></span><div class="highlight python"><div class="codearea" dir="ltr" lang="en"><pre dir="ltr" id="CA-a13fbecb7479789924417ca46f6efaaa5950d1c0" lang="en"><span class="line"><span class="anchor" id="line-1-38"></span><span class="ID">wdict</span> = {}</span>
<span class="line"><span class="anchor" id="line-2-14"></span><span class="ResWord">for</span> <span class="ID">word</span> <span class="ResWord">in</span> <span class="ID">words</span>:</span>
<span class="line"><span class="anchor" id="line-3-12"></span> <span class="ResWord">if</span> <span class="ID">word</span> <span class="ResWord">not</span> <span class="ResWord">in</span> <span class="ID">wdict</span>:</span>
<span class="line"><span class="anchor" id="line-4-8"></span> <span class="ID">wdict</span>[<span class="ID">word</span>] = <span class="Number">0</span></span>
<span class="line"><span class="anchor" id="line-5-7"></span> <span class="ID">wdict</span>[<span class="ID">word</span>] += <span class="Number">1</span></span>
</pre></div></div><span class="anchor" id="line-315"></span><span class="anchor" id="line-316"></span><p class="line862">Except for the first time, each time a word is seen the <tt>if</tt> statement's <span class="anchor" id="line-317"></span>test fails. If you are counting a large number of words, many will <span class="anchor" id="line-318"></span>probably occur multiple times. In a situation where the initialization <span class="anchor" id="line-319"></span>of a value is only going to occur once and the augmentation of that <span class="anchor" id="line-320"></span>value will occur many times it is cheaper to use a <tt>try</tt> statement: <span class="anchor" id="line-321"></span><span class="anchor" id="line-322"></span><p class="line867"><span class="anchor" id="line-323"></span><span class="anchor" id="line-324"></span><span class="anchor" id="line-325"></span><span class="anchor" id="line-326"></span><span class="anchor" id="line-327"></span><span class="anchor" id="line-328"></span><span class="anchor" id="line-329"></span><span class="anchor" id="line-1-39"></span><div class="highlight python"><div class="codearea" dir="ltr" lang="en"><pre dir="ltr" id="CA-81601f5364edc14222d0d9010b492aa2efeb7ebd" lang="en"><span class="line"><span class="anchor" id="line-1-40"></span><span class="ID">wdict</span> = {}</span>
<span class="line"><span class="anchor" id="line-2-15"></span><span class="ResWord">for</span> <span class="ID">word</span> <span class="ResWord">in</span> <span class="ID">words</span>:</span>
<span class="line"><span class="anchor" id="line-3-13"></span> <span class="ResWord">try</span>:</span>
<span class="line"><span class="anchor" id="line-4-9"></span> <span class="ID">wdict</span>[<span class="ID">word</span>] += <span class="Number">1</span></span>
<span class="line"><span class="anchor" id="line-5-8"></span> <span class="ResWord">except</span> <span class="ID">KeyError</span>:</span>
<span class="line"><span class="anchor" id="line-6-4"></span> <span class="ID">wdict</span>[<span class="ID">word</span>] = <span class="Number">1</span></span>
</pre></div></div><span class="anchor" id="line-330"></span><span class="anchor" id="line-331"></span><p class="line862">It's important to catch the expected <a href="/moin/KeyError">KeyError</a> exception, and not have a <span class="anchor" id="line-332"></span>default <tt>except</tt> clause to avoid trying to recover from an exception you <span class="anchor" id="line-333"></span>really can't handle by the statement(s) in the <tt>try</tt> clause. <span class="anchor" id="line-334"></span><span class="anchor" id="line-335"></span><p class="line874">A third alternative became available with the release of Python 2.x. <span class="anchor" id="line-336"></span>Dictionaries now have a get() method which will return a default value <span class="anchor" id="line-337"></span>if the desired key isn't found in the dictionary. This simplifies the loop: <span class="anchor" id="line-338"></span><span class="anchor" id="line-339"></span><p class="line867"><span class="anchor" id="line-340"></span><span class="anchor" id="line-341"></span><span class="anchor" id="line-342"></span><span class="anchor" id="line-343"></span><span class="anchor" id="line-344"></span><span class="anchor" id="line-1-41"></span><div class="highlight python"><div class="codearea" dir="ltr" lang="en"><pre dir="ltr" id="CA-c5935756bc5f2a1a44a923cd62d434e71c87a92b" lang="en"><span class="line"><span class="anchor" id="line-1-42"></span><span class="ID">wdict</span> = {}</span>
<span class="line"><span class="anchor" id="line-2-16"></span><span class="ID">get</span> = <span class="ID">wdict</span>.<span class="ID">get</span></span>
<span class="line"><span class="anchor" id="line-3-14"></span><span class="ResWord">for</span> <span class="ID">word</span> <span class="ResWord">in</span> <span class="ID">words</span>:</span>
<span class="line"><span class="anchor" id="line-4-10"></span> <span class="ID">wdict</span>[<span class="ID">word</span>] = <span class="ID">get</span>(<span class="ID">word</span>, <span class="Number">0</span>) + <span class="Number">1</span></span>
</pre></div></div><span class="anchor" id="line-345"></span><span class="anchor" id="line-346"></span><p class="line874">When I originally wrote this section, there were clear situations where <span class="anchor" id="line-347"></span>one of the first two approaches was faster. It seems that all three <span class="anchor" id="line-348"></span>approaches now exhibit similar performance (within about 10% of each <span class="anchor" id="line-349"></span>other), more or less independent of the properties of the list of words. <span class="anchor" id="line-350"></span><span class="anchor" id="line-351"></span><p class="line874">Also, if the value stored in the dictionary is an object or a (mutable) list, <span class="anchor" id="line-352"></span>you could also use the <tt>dict.setdefault</tt> method, e.g. <span class="anchor" id="line-353"></span><span class="anchor" id="line-354"></span><p class="line867"><span class="anchor" id="line-355"></span><span class="anchor" id="line-356"></span><span class="anchor" id="line-1-43"></span><span class="anchor" id="line-2-17"></span><span class="anchor" id="line-3-15"></span><span class="anchor" id="line-4-11"></span><div class="highlight python"><div class="codearea" dir="ltr" lang="en">
<script type="text/javascript">
// Tell whether the listing element `obj` already shows line numbers.
// Only the first child is inspected: the result is truthy when the first
// node inside obj's first child has the class name 'LineNumber'.
// When obj, or its first child, has no child nodes, the (falsy) length
// value itself is returned, exactly as a short-circuiting && chain would.
function isnumbered(obj) {
  var lineCount = obj.childNodes.length;
  if (!lineCount) {
    return lineCount;
  }
  var firstLine = obj.firstChild;
  var partCount = firstLine.childNodes.length;
  if (!partCount) {
    return partCount;
  }
  return firstLine.firstChild.className == 'LineNumber';
}
// Format `num` as text, left-padded with spaces to at least `chrs`
// characters, and append `add` to the end. Values already `chrs` wide or
// wider are not truncated.
function nformat(num, chrs, add) {
  var digits = '' + num;
  var padding = '';
  for (var missing = chrs - digits.length; missing > 0; missing--) {
    padding += ' ';
  }
  return padding + digits + add;
}
// Add a line-number prefix to every SPAN child of the element with id `did`.
//   did    - id of the element whose SPAN children are the code lines
//   nstart - number given to the first line (defaults to 1)
//   nstep  - increment between consecutive lines (defaults to 1)
// Each prefix is a SPAN of class 'LineNumber' holding a link to
// '#<did>_<n>' followed by a single space. Does nothing when the element is
// missing or isnumbered() reports it as already numbered.
// Always returns false.
function addnumber(did, nstart, nstep) {
  var c = document.getElementById(did);
  // getElementById yields null for an unknown id; bail out instead of
  // throwing so the always-false return value is still delivered.
  if (!c) {
    return false;
  }
  if (isnumbered(c)) {
    return false;
  }
  if (typeof nstart == 'undefined') nstart = 1;
  if (typeof nstep == 'undefined') nstep = 1;
  var n = nstart;
  for (var l = c.firstChild; l != null; l = l.nextSibling) {
    // Only SPAN children are treated as lines; other nodes are skipped.
    if (l.tagName != 'SPAN') {
      continue;
    }
    var s = document.createElement('SPAN');
    var a = document.createElement('A');
    s.className = 'LineNumber';
    a.appendChild(document.createTextNode(nformat(n, 4, '')));
    a.href = '#' + did + '_' + n;
    s.appendChild(a);
    s.appendChild(document.createTextNode(' '));
    n += nstep;
    // Put the number first on the line; an empty line just receives it.
    if (l.childNodes.length) {
      l.insertBefore(s, l.firstChild);
    } else {
      l.appendChild(s);
    }
  }
  return false;
}
// Remove the line-number prefixes from the element with id `did`.
// Acts only when isnumbered() reports the element as numbered; then, for
// every SPAN child whose first child has class 'LineNumber', that first
// child is removed. Does nothing when the element is missing.
// Always returns false.
function remnumber(did) {
  var c = document.getElementById(did);
  // getElementById yields null for an unknown id; bail out instead of
  // throwing so the always-false return value is still delivered.
  if (!c) {
    return false;
  }
  if (isnumbered(c)) {
    for (var l = c.firstChild; l != null; l = l.nextSibling) {
      var lead = l.firstChild;
      // `lead` is null for an empty SPAN; skip it rather than dereference.
      if (l.tagName == 'SPAN' && lead && lead.className == 'LineNumber') {
        l.removeChild(lead);
      }
    }
  }
  return false;
}
// Switch line numbers on or off for the element with id `did`.
//   did    - id of the element holding the code lines
//   nstart - first line number, forwarded to addnumber()
//   nstep  - line number increment, forwarded to addnumber()
// Calls remnumber() when isnumbered() reports the element as numbered and
// addnumber() otherwise. Does nothing when the element is missing.
// Always returns false.
function togglenumber(did, nstart, nstep) {
  var c = document.getElementById(did);
  // getElementById yields null for an unknown id; return false right away
  // instead of letting isnumbered() throw on the null element.
  if (!c) {
    return false;
  }
  if (isnumbered(c)) {
    remnumber(did);
  } else {
    addnumber(did, nstart, nstep);
  }
  return false;
}
</script>
<script type="text/javascript">
// Write the "Toggle line numbers" link into the page. Its onclick handler
// calls togglenumber() for the code area with this id, numbering from 4 in
// steps of 1. The trailing backslash continues the string onto the next line.
document.write('<a href="#" onclick="return togglenumber(\'CA-2434f12ff162b4bd4948e2241d93034169787184\', 4, 1);" \
class="codenumbers">Toggle line numbers<\/a>');
</script>
<pre dir="ltr" id="CA-2434f12ff162b4bd4948e2241d93034169787184" lang="en"><span class="line"><span class="LineNumber"><a href="#CA-2434f12ff162b4bd4948e2241d93034169787184_4"> 4</a> </span><span class="LineAnchor" id="CA-2434f12ff162b4bd4948e2241d93034169787184_4"></span><span class="anchor" id="line-1-44"></span> <span class="ID">wdict</span>.<span class="ID">setdefault</span>(<span class="ID">key</span>, []).<span class="ID">append</span>(<span class="ID">new_element</span>)</span>
</pre></div></div><span class="anchor" id="line-357"></span><span class="anchor" id="line-358"></span><p class="line874">You might think that this avoids having to look up the key twice. <span class="anchor" id="line-359"></span>It actually doesn't (even in Python 3.0), but at least the double <span class="anchor" id="line-360"></span>lookup is performed in C. <span class="anchor" id="line-361"></span><span class="anchor" id="line-362"></span><p class="line862">Another option is to use the <a class="http" href="http://docs.python.org/py3k/library/collections.html#collections.defaultdict">defaultdict</a> class: <span class="anchor" id="line-363"></span><span class="anchor" id="line-364"></span><p class="line867"><span class="anchor" id="line-365"></span><span class="anchor" id="line-366"></span><span class="anchor" id="line-367"></span><span class="anchor" id="line-368"></span><span class="anchor" id="line-369"></span><span class="anchor" id="line-370"></span><span class="anchor" id="line-371"></span><span class="anchor" id="line-1-45"></span><div class="highlight python"><div class="codearea" dir="ltr" lang="en"><pre dir="ltr" id="CA-663059bb0104cd80b510d6909f41fb5a0f57b61f" lang="en"><span class="line"><span class="anchor" id="line-1-46"></span><span class="ResWord">from</span> <span class="ID">collections</span> <span class="ResWord">import</span> <span class="ID">defaultdict</span></span>
<span class="line"><span class="anchor" id="line-2-18"></span></span>
<span class="line"><span class="anchor" id="line-3-16"></span><span class="ID">wdict</span> = <span class="ID">defaultdict</span>(<span class="ResWord">int</span>)</span>
<span class="line"><span class="anchor" id="line-4-12"></span></span>
<span class="line"><span class="anchor" id="line-5-9"></span><span class="ResWord">for</span> <span class="ID">word</span> <span class="ResWord">in</span> <span class="ID">words</span>:</span>
<span class="line"><span class="anchor" id="line-6-5"></span> <span class="ID">wdict</span>[<span class="ID">word</span>] += <span class="Number">1</span></span>
</pre></div></div><span class="anchor" id="line-372"></span><span class="anchor" id="line-373"></span><p class="line867">
<h2 id="Import_Statement_Overhead">Import Statement Overhead</h2>
<span class="anchor" id="line-374"></span><span class="anchor" id="line-375"></span><p class="line867"><tt>import</tt> statements can be executed just about anywhere. It's often <span class="anchor" id="line-376"></span>useful to place them inside functions to restrict their visibility <span class="anchor" id="line-377"></span>and/or reduce initial startup time. Although Python's interpreter is <span class="anchor" id="line-378"></span>optimized to not import the same module multiple times, repeatedly <span class="anchor" id="line-379"></span>executing an import statement can seriously affect performance in some <span class="anchor" id="line-380"></span>circumstances. <span class="anchor" id="line-381"></span><span class="anchor" id="line-382"></span><p class="line862">Consider the following two snippets of code (originally from Greg <a class="nonexistent" href="/moin/McFarlane">McFarlane</a>, <span class="anchor" id="line-383"></span>I believe - I found it unattributed in a comp.lang.python <span class="anchor" id="line-384"></span><a class="mailto" href="mailto:python-list@python.org">python-list@python.org</a> posting and later attributed to him in another <span class="anchor" id="line-385"></span>source): <span class="anchor" id="line-386"></span><span class="anchor" id="line-387"></span><p class="line867"><span class="anchor" id="line-388"></span><span class="anchor" id="line-389"></span><span class="anchor" id="line-390"></span><span class="anchor" id="line-391"></span><span class="anchor" id="line-392"></span><span class="anchor" id="line-393"></span><span class="anchor" id="line-394"></span><span class="anchor" id="line-1-47"></span><div class="highlight python"><div class="codearea" dir="ltr" lang="en"><pre dir="ltr" id="CA-a874078e7136f2f65e303848cb8e6434de8288f2" lang="en"><span class="line"><span class="anchor" id="line-1-48"></span><span class="ResWord">def</span> <span class="ID">doit1</span>():</span>
<span class="line"><span class="anchor" id="line-2-19"></span> <span class="ResWord">import</span> <span class="ID">string</span> <span class="Comment">###### import statement inside function</span></span>
<span class="line"><span class="anchor" id="line-3-17"></span> <span class="ID">string</span>.<span class="ID">lower</span>(<span class="String">'</span><span class="String">Python</span><span class="String">'</span>)</span>
<span class="line"><span class="anchor" id="line-4-13"></span></span>
<span class="line"><span class="anchor" id="line-5-10"></span><span class="ResWord">for</span> <span class="ID">num</span> <span class="ResWord">in</span> <span class="ResWord">range</span>(<span class="Number">100000</span>):</span>
<span class="line"><span class="anchor" id="line-6-6"></span> <span class="ID">doit1</span>()</span>
</pre></div></div><span class="anchor" id="line-395"></span><span class="anchor" id="line-396"></span><p class="line874">or: <span class="anchor" id="line-397"></span><span class="anchor" id="line-398"></span><p class="line867"><span class="anchor" id="line-399"></span><span class="anchor" id="line-400"></span><span class="anchor" id="line-401"></span><span class="anchor" id="line-402"></span><span class="anchor" id="line-403"></span><span class="anchor" id="line-404"></span><span class="anchor" id="line-405"></span><span class="anchor" id="line-1-49"></span><div class="highlight python"><div class="codearea" dir="ltr" lang="en"><pre dir="ltr" id="CA-0e6b101d79d4e1f2ff2e05b0c3d45af3bb24d438" lang="en"><span class="line"><span class="anchor" id="line-1-50"></span><span class="ResWord">import</span> <span class="ID">string</span> <span class="Comment">###### import statement outside function</span></span>
<span class="line"><span class="anchor" id="line-2-20"></span><span class="ResWord">def</span> <span class="ID">doit2</span>():</span>
<span class="line"><span class="anchor" id="line-3-18"></span> <span class="ID">string</span>.<span class="ID">lower</span>(<span class="String">'</span><span class="String">Python</span><span class="String">'</span>)</span>
<span class="line"><span class="anchor" id="line-4-14"></span></span>
<span class="line"><span class="anchor" id="line-5-11"></span><span class="ResWord">for</span> <span class="ID">num</span> <span class="ResWord">in</span> <span class="ResWord">range</span>(<span class="Number">100000</span>):</span>
<span class="line"><span class="anchor" id="line-6-7"></span> <span class="ID">doit2</span>()</span>
</pre></div></div><span class="anchor" id="line-406"></span><span class="anchor" id="line-407"></span><p class="line867"><tt>doit2</tt> will run much faster than <tt>doit1</tt>, even though the reference <span class="anchor" id="line-408"></span>to the string module is global in <tt>doit2</tt>. Here's a Python interpreter <span class="anchor" id="line-409"></span>session run using Python 2.3 and the new <tt>timeit</tt> module, which shows how <span class="anchor" id="line-410"></span>much faster the second is than the first: <span class="anchor" id="line-411"></span><span class="anchor" id="line-412"></span><p class="line867"><span class="anchor" id="line-413"></span><span class="anchor" id="line-414"></span><span class="anchor" id="line-415"></span><span class="anchor" id="line-416"></span><span class="anchor" id="line-417"></span><span class="anchor" id="line-418"></span><span class="anchor" id="line-419"></span><span class="anchor" id="line-420"></span><span class="anchor" id="line-421"></span><span class="anchor" id="line-422"></span><span class="anchor" id="line-423"></span><span class="anchor" id="line-424"></span><span class="anchor" id="line-425"></span><span class="anchor" id="line-426"></span><span class="anchor" id="line-427"></span><span class="anchor" id="line-428"></span><span class="anchor" id="line-1-51"></span><div class="highlight python"><div class="codearea" dir="ltr" lang="en"><pre dir="ltr" id="CA-87c265b8004de847d76142dc50effcdf49796db7" lang="en"><span class="line"><span class="anchor" id="line-1-52"></span>&gt;&gt;&gt; <span class="ResWord">def</span> <span class="ID">doit1</span>():</span>
<span class="line"><span class="anchor" id="line-2-21"></span>... <span class="ResWord">import</span> <span class="ID">string</span></span>
<span class="line"><span class="anchor" id="line-3-19"></span>... <span class="ID">string</span>.<span class="ID">lower</span>(<span class="String">'</span><span class="String">Python</span><span class="String">'</span>)</span>
<span class="line"><span class="anchor" id="line-4-15"></span>...</span>
<span class="line"><span class="anchor" id="line-5-12"></span>&gt;&gt;&gt; <span class="ResWord">import</span> <span class="ID">string</span></span>
<span class="line"><span class="anchor" id="line-6-8"></span>&gt;&gt;&gt; <span class="ResWord">def</span> <span class="ID">doit2</span>():</span>
<span class="line"><span class="anchor" id="line-7-3"></span>... <span class="ID">string</span>.<span class="ID">lower</span>(<span class="String">'</span><span class="String">Python</span><span class="String">'</span>)</span>
<span class="line"><span class="anchor" id="line-8-2"></span>...</span>
<span class="line"><span class="anchor" id="line-9-2"></span>&gt;&gt;&gt; <span class="ResWord">import</span> <span class="ID">timeit</span></span>
<span class="line"><span class="anchor" id="line-10-2"></span>&gt;&gt;&gt; <span class="ID">t</span> = <span class="ID">timeit</span>.<span class="ID">Timer</span>(<span class="ID">setup</span>=<span class="String">'</span><span class="String">from __main__ import doit1</span><span class="String">'</span>, <span class="ID">stmt</span>=<span class="String">'</span><span class="String">doit1()</span><span class="String">'</span>)</span>
<span class="line"><span class="anchor" id="line-11-2"></span>&gt;&gt;&gt; <span class="ID">t</span>.<span class="ID">timeit</span>()</span>
<span class="line"><span class="anchor" id="line-12-2"></span><span class="Number">11.479144930839539</span></span>
<span class="line"><span class="anchor" id="line-13-1"></span>&gt;&gt;&gt; <span class="ID">t</span> = <span class="ID">timeit</span>.<span class="ID">Timer</span>(<span class="ID">setup</span>=<span class="String">'</span><span class="String">from __main__ import doit2</span><span class="String">'</span>, <span class="ID">stmt</span>=<span class="String">'</span><span class="String">doit2()</span><span class="String">'</span>)</span>
<span class="line"><span class="anchor" id="line-14-1"></span>&gt;&gt;&gt; <span class="ID">t</span>.<span class="ID">timeit</span>()</span>
<span class="line"><span class="anchor" id="line-15-1"></span><span class="Number">4.6661689281463623</span></span>
</pre></div></div><span class="anchor" id="line-429"></span><span class="anchor" id="line-430"></span><p class="line874">String methods were introduced to the language in Python 2.0. These <span class="anchor" id="line-431"></span>provide a version that avoids the import completely and runs even faster: <span class="anchor" id="line-432"></span><span class="anchor" id="line-433"></span><p class="line867"><span class="anchor" id="line-434"></span><span class="anchor" id="line-435"></span><span class="anchor" id="line-436"></span><span class="anchor" id="line-437"></span><span class="anchor" id="line-438"></span><span class="anchor" id="line-439"></span><span class="anchor" id="line-1-53"></span><div class="highlight python"><div class="codearea" dir="ltr" lang="en"><pre dir="ltr" id="CA-6fd66245bae98d34895f45d85a348c7feded13c6" lang="en"><span class="line"><span class="anchor" id="line-1-54"></span><span class="ResWord">def</span> <span class="ID">doit3</span>():</span>
<span class="line"><span class="anchor" id="line-2-22"></span> <span class="String">'</span><span class="String">Python</span><span class="String">'</span>.<span class="ID">lower</span>()</span>
<span class="line"><span class="anchor" id="line-3-20"></span></span>
<span class="line"><span class="anchor" id="line-4-16"></span><span class="ResWord">for</span> <span class="ID">num</span> <span class="ResWord">in</span> <span class="ResWord">range</span>(<span class="Number">100000</span>):</span>
<span class="line"><span class="anchor" id="line-5-13"></span> <span class="ID">doit3</span>()</span>
</pre></div></div><span class="anchor" id="line-440"></span><span class="anchor" id="line-441"></span><p class="line862">Here's the proof from <tt>timeit</tt>: <span class="anchor" id="line-442"></span><span class="anchor" id="line-443"></span><p class="line867"><span class="anchor" id="line-444"></span><span class="anchor" id="line-445"></span><span class="anchor" id="line-446"></span><span class="anchor" id="line-447"></span><span class="anchor" id="line-448"></span><span class="anchor" id="line-449"></span><span class="anchor" id="line-450"></span><span class="anchor" id="line-1-55"></span><div class="highlight python"><div class="codearea" dir="ltr" lang="en"><pre dir="ltr" id="CA-4a01a94813892348a0d0b6ce2d49365763c4bc77" lang="en"><span class="line"><span class="anchor" id="line-1-56"></span>&gt;&gt;&gt; <span class="ResWord">def</span> <span class="ID">doit3</span>():</span>
<span class="line"><span class="anchor" id="line-2-23"></span>... <span class="String">'</span><span class="String">Python</span><span class="String">'</span>.<span class="ID">lower</span>()</span>
<span class="line"><span class="anchor" id="line-3-21"></span>...</span>
<span class="line"><span class="anchor" id="line-4-17"></span>&gt;&gt;&gt; <span class="ID">t</span> = <span class="ID">timeit</span>.<span class="ID">Timer</span>(<span class="ID">setup</span>=<span class="String">'</span><span class="String">from __main__ import doit3</span><span class="String">'</span>, <span class="ID">stmt</span>=<span class="String">'</span><span class="String">doit3()</span><span class="String">'</span>)</span>
<span class="line"><span class="anchor" id="line-5-14"></span>&gt;&gt;&gt; <span class="ID">t</span>.<span class="ID">timeit</span>()</span>
<span class="line"><span class="anchor" id="line-6-9"></span><span class="Number">2.5606080293655396</span></span>
</pre></div></div><span class="anchor" id="line-451"></span><span class="anchor" id="line-452"></span><p class="line874">The above example is obviously a bit contrived, but the general <span class="anchor" id="line-453"></span>principle holds. <span class="anchor" id="line-454"></span><span class="anchor" id="line-455"></span><p class="line874">Note that putting an import in a function can speed up the initial loading <span class="anchor" id="line-456"></span>of the module, especially if the imported module might not be required. This <span class="anchor" id="line-457"></span>is generally a case of a "lazy" optimization -- avoiding work (importing a module, <span class="anchor" id="line-458"></span>which can be very expensive) until you are sure it is required. <span class="anchor" id="line-459"></span><span class="anchor" id="line-460"></span><p class="line874">This is only a significant saving in cases where the module wouldn't have been imported <span class="anchor" id="line-461"></span>at all (from any module) -- if the module is already loaded (as will be the case for many standard <span class="anchor" id="line-462"></span>modules, like <tt>string</tt> or <tt>re</tt>), avoiding an import doesn't save you anything. <span class="anchor" id="line-463"></span>To see what modules are loaded in the system look in <tt>sys.modules</tt>. 
<span class="anchor" id="line-464"></span><span class="anchor" id="line-465"></span><p class="line874">A good way to do lazy imports is: <span class="anchor" id="line-466"></span><span class="anchor" id="line-467"></span><p class="line867"><span class="anchor" id="line-468"></span><span class="anchor" id="line-469"></span><span class="anchor" id="line-470"></span><span class="anchor" id="line-471"></span><span class="anchor" id="line-472"></span><span class="anchor" id="line-473"></span><span class="anchor" id="line-474"></span><span class="anchor" id="line-475"></span><span class="anchor" id="line-1-57"></span><div class="highlight python"><div class="codearea" dir="ltr" lang="en"><pre dir="ltr" id="CA-7ad16013a0315f8a028de014231b09bbfcfabf57" lang="en"><span class="line"><span class="anchor" id="line-1-58"></span><span class="ID">email</span> = <span class="ResWord">None</span></span>
<span class="line"><span class="anchor" id="line-2-24"></span></span>
<span class="line"><span class="anchor" id="line-3-22"></span><span class="ResWord">def</span> <span class="ID">parse_email</span>():</span>
<span class="line"><span class="anchor" id="line-4-18"></span> <span class="ResWord">global</span> <span class="ID">email</span></span>
<span class="line"><span class="anchor" id="line-5-15"></span> <span class="ResWord">if</span> <span class="ID">email</span> <span class="ResWord">is</span> <span class="ResWord">None</span>:</span>
<span class="line"><span class="anchor" id="line-6-10"></span> <span class="ResWord">import</span> <span class="ID">email</span></span>
<span class="line"><span class="anchor" id="line-7-4"></span> ...</span>
</pre></div></div><span class="anchor" id="line-476"></span><span class="anchor" id="line-477"></span><p class="line862">This way the <tt>email</tt> module will only be imported once, on the first <span class="anchor" id="line-478"></span>invocation of <tt>parse_email()</tt>. <span class="anchor" id="line-479"></span><span class="anchor" id="line-480"></span><p class="line867">
<h2 id="Data_Aggregation">Data Aggregation</h2>
<span class="anchor" id="line-481"></span><span class="anchor" id="line-482"></span><p class="line874">Function call overhead in Python is relatively high, especially compared <span class="anchor" id="line-483"></span>with the execution speed of a builtin function. This strongly suggests <span class="anchor" id="line-484"></span>that where appropriate, functions should handle data aggregates. Here's <span class="anchor" id="line-485"></span>a contrived example written in Python. <span class="anchor" id="line-486"></span><span class="anchor" id="line-487"></span><p class="line867"><span class="anchor" id="line-488"></span><span class="anchor" id="line-489"></span><span class="anchor" id="line-490"></span><span class="anchor" id="line-491"></span><span class="anchor" id="line-492"></span><span class="anchor" id="line-493"></span><span class="anchor" id="line-494"></span><span class="anchor" id="line-495"></span><span class="anchor" id="line-496"></span><span class="anchor" id="line-497"></span><span class="anchor" id="line-498"></span><span class="anchor" id="line-499"></span><span class="anchor" id="line-500"></span><span class="anchor" id="line-1-59"></span><div class="highlight python"><div class="codearea" dir="ltr" lang="en"><pre dir="ltr" id="CA-ef09fec79730d2416277b6ba9907b0f7da5bc513" lang="en"><span class="line"><span class="anchor" id="line-1-60"></span><span class="ResWord">import</span> <span class="ID">time</span></span>
<span class="line"><span class="anchor" id="line-2-25"></span><span class="ID">x</span> = <span class="Number">0</span></span>
<span class="line"><span class="anchor" id="line-3-23"></span><span class="ResWord">def</span> <span class="ID">doit1</span>(<span class="ID">i</span>):</span>
<span class="line"><span class="anchor" id="line-4-19"></span> <span class="ResWord">global</span> <span class="ID">x</span></span>
<span class="line"><span class="anchor" id="line-5-16"></span> <span class="ID">x</span> = <span class="ID">x</span> + <span class="ID">i</span></span>
<span class="line"><span class="anchor" id="line-6-11"></span></span>
<span class="line"><span class="anchor" id="line-7-5"></span><span class="ResWord">list</span> = <span class="ResWord">range</span>(<span class="Number">100000</span>)</span>
<span class="line"><span class="anchor" id="line-8-3"></span><span class="ID">t</span> = <span class="ID">time</span>.<span class="ID">time</span>()</span>
<span class="line"><span class="anchor" id="line-9-3"></span><span class="ResWord">for</span> <span class="ID">i</span> <span class="ResWord">in</span> <span class="ResWord">list</span>:</span>
<span class="line"><span class="anchor" id="line-10-3"></span> <span class="ID">doit1</span>(<span class="ID">i</span>)</span>
<span class="line"><span class="anchor" id="line-11-3"></span></span>
<span class="line"><span class="anchor" id="line-12-3"></span><span class="ResWord">print</span> <span class="String">"</span><span class="String">%.3f</span><span class="String">"</span> % (<span class="ID">time</span>.<span class="ID">time</span>()-<span class="ID">t</span>)</span>
</pre></div></div><span class="anchor" id="line-501"></span><span class="anchor" id="line-502"></span><p class="line874">vs. <span class="anchor" id="line-503"></span><span class="anchor" id="line-504"></span><p class="line867"><span class="anchor" id="line-505"></span><span class="anchor" id="line-506"></span><span class="anchor" id="line-507"></span><span class="anchor" id="line-508"></span><span class="anchor" id="line-509"></span><span class="anchor" id="line-510"></span><span class="anchor" id="line-511"></span><span class="anchor" id="line-512"></span><span class="anchor" id="line-513"></span><span class="anchor" id="line-514"></span><span class="anchor" id="line-515"></span><span class="anchor" id="line-516"></span><span class="anchor" id="line-1-61"></span><div class="highlight python"><div class="codearea" dir="ltr" lang="en"><pre dir="ltr" id="CA-f3fe782098b5ab86de262f75e10897492d223ab3" lang="en"><span class="line"><span class="anchor" id="line-1-62"></span><span class="ResWord">import</span> <span class="ID">time</span></span>
<span class="line"><span class="anchor" id="line-2-26"></span><span class="ID">x</span> = <span class="Number">0</span></span>
<span class="line"><span class="anchor" id="line-3-24"></span><span class="ResWord">def</span> <span class="ID">doit2</span>(<span class="ResWord">list</span>):</span>
<span class="line"><span class="anchor" id="line-4-20"></span> <span class="ResWord">global</span> <span class="ID">x</span></span>
<span class="line"><span class="anchor" id="line-5-17"></span> <span class="ResWord">for</span> <span class="ID">i</span> <span class="ResWord">in</span> <span class="ResWord">list</span>:</span>
<span class="line"><span class="anchor" id="line-6-12"></span> <span class="ID">x</span> = <span class="ID">x</span> + <span class="ID">i</span></span>
<span class="line"><span class="anchor" id="line-7-6"></span></span>
<span class="line"><span class="anchor" id="line-8-4"></span><span class="ResWord">list</span> = <span class="ResWord">range</span>(<span class="Number">100000</span>)</span>
<span class="line"><span class="anchor" id="line-9-4"></span><span class="ID">t</span> = <span class="ID">time</span>.<span class="ID">time</span>()</span>
<span class="line"><span class="anchor" id="line-10-4"></span><span class="ID">doit2</span>(<span class="ResWord">list</span>)</span>
<span class="line"><span class="anchor" id="line-11-4"></span><span class="ResWord">print</span> <span class="String">"</span><span class="String">%.3f</span><span class="String">"</span> % (<span class="ID">time</span>.<span class="ID">time</span>()-<span class="ID">t</span>)</span>
</pre></div></div><span class="anchor" id="line-517"></span><span class="anchor" id="line-518"></span><p class="line874">Here's the proof in the pudding using an interactive session: <span class="anchor" id="line-519"></span><span class="anchor" id="line-520"></span><p class="line867"><span class="anchor" id="line-521"></span><span class="anchor" id="line-522"></span><span class="anchor" id="line-523"></span><span class="anchor" id="line-524"></span><span class="anchor" id="line-525"></span><span class="anchor" id="line-526"></span><span class="anchor" id="line-527"></span><span class="anchor" id="line-528"></span><span class="anchor" id="line-529"></span><span class="anchor" id="line-530"></span><span class="anchor" id="line-531"></span><span class="anchor" id="line-1-63"></span><div class="highlight python"><div class="codearea" dir="ltr" lang="en"><pre dir="ltr" id="CA-b45dd8623a995598e0e358b072738541bfc2ef07" lang="en"><span class="line"><span class="anchor" id="line-1-64"></span>&gt;&gt;&gt; <span class="ID">t</span> = <span class="ID">time</span>.<span class="ID">time</span>()</span>
<span class="line"><span class="anchor" id="line-2-27"></span>&gt;&gt;&gt; <span class="ResWord">for</span> <span class="ID">i</span> <span class="ResWord">in</span> <span class="ResWord">list</span>:</span>
<span class="line"><span class="anchor" id="line-3-25"></span>... <span class="ID">doit1</span>(<span class="ID">i</span>)</span>
<span class="line"><span class="anchor" id="line-4-21"></span>...</span>
<span class="line"><span class="anchor" id="line-5-18"></span>&gt;&gt;&gt; <span class="ResWord">print</span> <span class="String">"</span><span class="String">%.3f</span><span class="String">"</span> % (<span class="ID">time</span>.<span class="ID">time</span>()-<span class="ID">t</span>)</span>
<span class="line"><span class="anchor" id="line-6-13"></span><span class="Number">0.758</span></span>
<span class="line"><span class="anchor" id="line-7-7"></span>&gt;&gt;&gt; <span class="ID">t</span> = <span class="ID">time</span>.<span class="ID">time</span>()</span>
<span class="line"><span class="anchor" id="line-8-5"></span>&gt;&gt;&gt; <span class="ID">doit2</span>(<span class="ResWord">list</span>)</span>
<span class="line"><span class="anchor" id="line-9-5"></span>&gt;&gt;&gt; <span class="ResWord">print</span> <span class="String">"</span><span class="String">%.3f</span><span class="String">"</span> % (<span class="ID">time</span>.<span class="ID">time</span>()-<span class="ID">t</span>)</span>
<span class="line"><span class="anchor" id="line-10-5"></span><span class="Number">0.204</span></span>
</pre></div></div><span class="anchor" id="line-532"></span><span class="anchor" id="line-533"></span><p class="line874">Even written in Python, the second example runs about four times faster <span class="anchor" id="line-534"></span>than the first. Had <tt>doit</tt> been written in C the difference would likely <span class="anchor" id="line-535"></span>have been even greater (exchanging a Python <tt>for</tt> loop for a C <tt>for</tt> <span class="anchor" id="line-536"></span>loop as well as removing most of the function calls). <span class="anchor" id="line-537"></span><span class="anchor" id="line-538"></span><span class="anchor" id="line-539"></span><p class="line867">
<h2 id="Doing_Stuff_Less_Often">Doing Stuff Less Often</h2>
<span class="anchor" id="line-540"></span><span class="anchor" id="line-541"></span><p class="line874">The Python interpreter performs some periodic checks. In particular, it <span class="anchor" id="line-542"></span>decides whether or not to let another thread run and whether or not to <span class="anchor" id="line-543"></span>run a pending call (typically a call established by a signal handler). <span class="anchor" id="line-544"></span>Most of the time there's nothing to do, so performing these checks each <span class="anchor" id="line-545"></span>pass around the interpreter loop can slow things down. There is a <span class="anchor" id="line-546"></span>function in the <tt>sys</tt> module, <tt>setcheckinterval</tt>, which you can call to <span class="anchor" id="line-547"></span>tell the interpreter how often to perform these periodic checks. Prior <span class="anchor" id="line-548"></span>to the release of Python 2.3 it defaulted to 10. In 2.3 this was raised <span class="anchor" id="line-549"></span>to 100. If you aren't running with threads and you don't expect to be <span class="anchor" id="line-550"></span>catching many signals, setting this to a larger value can improve the <span class="anchor" id="line-551"></span>interpreter's performance, sometimes substantially. <span class="anchor" id="line-552"></span><span class="anchor" id="line-553"></span><span class="anchor" id="line-554"></span><p class="line867">
<h2 id="Python_is_not_C">Python is not C</h2>
<span class="anchor" id="line-555"></span><span class="anchor" id="line-556"></span><p class="line874">It is also not Perl, Java, C++ or Haskell. Be careful when transferring <span class="anchor" id="line-557"></span>your knowledge of how other languages perform to Python. A simple <span class="anchor" id="line-558"></span>example serves to demonstrate: <span class="anchor" id="line-559"></span><span class="anchor" id="line-560"></span><p class="line867"><span class="anchor" id="line-561"></span><span class="anchor" id="line-562"></span><span class="anchor" id="line-563"></span><span class="anchor" id="line-564"></span><span class="anchor" id="line-565"></span><span class="anchor" id="line-566"></span><span class="anchor" id="line-567"></span><pre><span class="anchor" id="line-1-1"></span>% timeit.py -s 'x = 47' 'x * 2'
<span class="anchor" id="line-2-1"></span>1000000 loops, best of 3: 0.574 usec per loop
<span class="anchor" id="line-3-1"></span>% timeit.py -s 'x = 47' 'x &lt;&lt; 1'
<span class="anchor" id="line-4-1"></span>1000000 loops, best of 3: 0.524 usec per loop
<span class="anchor" id="line-5-1"></span>% timeit.py -s 'x = 47' 'x + x'
<span class="anchor" id="line-6"></span>1000000 loops, best of 3: 0.382 usec per loop</pre><span class="anchor" id="line-568"></span><span class="anchor" id="line-569"></span><p class="line874">Now consider the similar C programs (only the add version is shown): <span class="anchor" id="line-570"></span><span class="anchor" id="line-571"></span><p class="line867"><span class="anchor" id="line-572"></span><span class="anchor" id="line-573"></span><span class="anchor" id="line-574"></span><span class="anchor" id="line-575"></span><span class="anchor" id="line-576"></span><span class="anchor" id="line-577"></span><span class="anchor" id="line-578"></span><span class="anchor" id="line-579"></span><span class="anchor" id="line-580"></span><span class="anchor" id="line-581"></span><span class="anchor" id="line-1-65"></span><div class="highlight cpp"><div class="codearea" dir="ltr" lang="en"><pre dir="ltr" id="CA-a5cbacc78a2c71c9fe7f2ae7864b99f620d7d7cd" lang="en"><span class="line"><span class="anchor" id="line-1-66"></span><span class="Preprc">#</span><span class="Preprc">include &lt;stdio.h&gt;</span><span class="Preprc"></span></span>
<span class="line"><span class="anchor" id="line-2-28"></span><span class="Preprc"></span></span>
<span class="line"><span class="anchor" id="line-3-26"></span><span class="ResWord">int</span> <span class="ID">main</span> (<span class="ResWord">int</span> <span class="ID">argc</span>, <span class="ResWord">char</span> *<span class="ID">argv</span>[]) {</span>
<span class="line"><span class="anchor" id="line-4-22"></span> <span class="ResWord">int</span> <span class="ID">i</span> = <span class="Number">47</span>;</span>
<span class="line"><span class="anchor" id="line-5-19"></span> <span class="ResWord">int</span> <span class="ID">loop</span>;</span>
<span class="line"><span class="anchor" id="line-6-14"></span> <span class="ResWord">for</span> (<span class="ID">loop</span>=<span class="Number">0</span>; <span class="ID">loop</span>&lt;<span class="Number">500000000</span>; <span class="ID">loop</span>++)</span>
<span class="line"><span class="anchor" id="line-7-8"></span> <span class="ID">i</span> + <span class="ID">i</span>;</span>
<span class="line"><span class="anchor" id="line-8-6"></span> <span class="ResWord">return</span> <span class="Number">0</span>;</span>
<span class="line"><span class="anchor" id="line-9-6"></span>}</span>
</pre></div></div><span class="anchor" id="line-582"></span><span class="anchor" id="line-583"></span><p class="line874">and the execution times: <span class="anchor" id="line-584"></span><span class="anchor" id="line-585"></span><p class="line867"><span class="anchor" id="line-586"></span><span class="anchor" id="line-587"></span><span class="anchor" id="line-588"></span><span class="anchor" id="line-589"></span><span class="anchor" id="line-590"></span><span class="anchor" id="line-591"></span><span class="anchor" id="line-592"></span><span class="anchor" id="line-593"></span><span class="anchor" id="line-594"></span><span class="anchor" id="line-595"></span><span class="anchor" id="line-596"></span><span class="anchor" id="line-597"></span><span class="anchor" id="line-598"></span><span class="anchor" id="line-599"></span><span class="anchor" id="line-600"></span><span class="anchor" id="line-601"></span><span class="anchor" id="line-602"></span><span class="anchor" id="line-603"></span><span class="anchor" id="line-604"></span><pre><span class="anchor" id="line-1-2"></span>% for prog in mult add shift ; do
<span class="anchor" id="line-2-2"></span>&lt; for i in 1 2 3 ; do
<span class="anchor" id="line-3-2"></span>&lt; echo -n "$prog: "
<span class="anchor" id="line-4-2"></span>&lt; /usr/bin/time ./$prog
<span class="anchor" id="line-5-2"></span>&lt; done
<span class="anchor" id="line-6-1"></span>&lt; echo
<span class="anchor" id="line-7"></span>&lt; done
<span class="anchor" id="line-8"></span>mult: 6.12 real 5.64 user 0.01 sys
<span class="anchor" id="line-9"></span>mult: 6.08 real 5.50 user 0.04 sys
<span class="anchor" id="line-10"></span>mult: 6.10 real 5.45 user 0.03 sys
<span class="anchor" id="line-11"></span>
<span class="anchor" id="line-12"></span>add: 6.07 real 5.54 user 0.00 sys
<span class="anchor" id="line-13"></span>add: 6.08 real 5.60 user 0.00 sys
<span class="anchor" id="line-14"></span>add: 6.07 real 5.58 user 0.01 sys
<span class="anchor" id="line-15"></span>
<span class="anchor" id="line-16"></span>shift: 6.09 real 5.55 user 0.01 sys
<span class="anchor" id="line-17"></span>shift: 6.10 real 5.62 user 0.01 sys
<span class="anchor" id="line-18"></span>shift: 6.06 real 5.50 user 0.01 sys</pre><span class="anchor" id="line-605"></span><span class="anchor" id="line-606"></span><p class="line874">Note that there is a significant advantage in Python to adding a number <span class="anchor" id="line-607"></span>to itself instead of multiplying it by two or shifting it left by one <span class="anchor" id="line-608"></span>bit. In C on all modern computer architectures, each of the three <span class="anchor" id="line-609"></span>arithmetic operations is translated into a single machine instruction <span class="anchor" id="line-610"></span>which executes in one cycle, so it doesn't really matter which one you <span class="anchor" id="line-611"></span>choose. <span class="anchor" id="line-612"></span><span class="anchor" id="line-613"></span><p class="line874">A common "test" new Python programmers often perform is to translate the <span class="anchor" id="line-614"></span>common Perl idiom <span class="anchor" id="line-615"></span><span class="anchor" id="line-616"></span><p class="line867"><span class="anchor" id="line-617"></span><span class="anchor" id="line-618"></span><span class="anchor" id="line-619"></span><span class="anchor" id="line-620"></span><pre><span class="anchor" id="line-1-3"></span>while (&lt;&gt;) {
<span class="anchor" id="line-2-3"></span> print;
<span class="anchor" id="line-3-3"></span>}</pre><span class="anchor" id="line-621"></span><span class="anchor" id="line-622"></span><p class="line874">into Python code that looks something like <span class="anchor" id="line-623"></span><span class="anchor" id="line-624"></span><p class="line867"><span class="anchor" id="line-625"></span><span class="anchor" id="line-626"></span><span class="anchor" id="line-627"></span><span class="anchor" id="line-628"></span><span class="anchor" id="line-629"></span><span class="anchor" id="line-1-67"></span><div class="highlight python"><div class="codearea" dir="ltr" lang="en"><pre dir="ltr" id="CA-af1edf96d3b212266201b18a1f321315af67c8a4" lang="en"><span class="line"><span class="anchor" id="line-1-68"></span><span class="ResWord">import</span> <span class="ID">fileinput</span></span>
<span class="line"><span class="anchor" id="line-2-29"></span></span>
<span class="line"><span class="anchor" id="line-3-27"></span><span class="ResWord">for</span> <span class="ID">line</span> <span class="ResWord">in</span> <span class="ID">fileinput</span>.<span class="ID">input</span>():</span>
<span class="line"><span class="anchor" id="line-4-23"></span> <span class="ResWord">print</span> <span class="ID">line</span>,</span>
</pre></div></div><span class="anchor" id="line-630"></span><span class="anchor" id="line-631"></span><p class="line874">and use it to conclude that Python must be much slower than Perl. As <span class="anchor" id="line-632"></span>others have pointed out numerous times, Python is slower than Perl for <span class="anchor" id="line-633"></span>some things and faster for others. Relative performance also often <span class="anchor" id="line-634"></span>depends on your experience with the two languages. <span class="anchor" id="line-635"></span><span class="anchor" id="line-636"></span><p class="line867">
<h2 id="Use_xrange_instead_of_range">Use xrange instead of range</h2>
<span class="anchor" id="line-637"></span><span class="anchor" id="line-638"></span><p class="line867"><span class="anchor" id="line-639"></span><span class="anchor" id="line-640"></span><span class="anchor" id="line-641"></span><div class="tip"><span class="anchor" id="line-1-69"></span><p class="line862">This section no longer applies if you're using Python 3, where <tt class="backtick">range</tt> now provides an iterator over ranges of arbitrary size, and where <tt class="backtick">xrange</tt> no longer exists. </div><span class="anchor" id="line-642"></span><span class="anchor" id="line-643"></span><p class="line862">Python has two ways to get a range of numbers: <tt>range</tt> and <tt>xrange</tt>. Most people know about <tt>range</tt>, because of its obvious name. <tt>xrange</tt>, being way down near the end of the alphabet, is much less well-known. <span class="anchor" id="line-644"></span><span class="anchor" id="line-645"></span><p class="line867"><tt>xrange</tt> is a generator object, basically equivalent to the following Python 2.3 code: <span class="anchor" id="line-646"></span><span class="anchor" id="line-647"></span><p class="line867"><span class="anchor" id="line-648"></span><span class="anchor" id="line-649"></span><span class="anchor" id="line-650"></span><span class="anchor" id="line-651"></span><span class="anchor" id="line-652"></span><span class="anchor" id="line-653"></span><span class="anchor" id="line-654"></span><span class="anchor" id="line-655"></span><span class="anchor" id="line-656"></span><span class="anchor" id="line-657"></span><span class="anchor" id="line-658"></span><span class="anchor" id="line-659"></span><span class="anchor" id="line-660"></span><span class="anchor" id="line-1-70"></span><div class="highlight python"><div class="codearea" dir="ltr" lang="en"><pre dir="ltr" id="CA-7848a9cfa7bafd3d75f8152cef2fe8768bdb0236" lang="en"><span class="line"><span class="anchor" id="line-1-71"></span><span class="ResWord">def</span> 
<span class="ID">xrange</span>(<span class="ID">start</span>, <span class="ID">stop</span>=<span class="ResWord">None</span>, <span class="ID">step</span>=<span class="Number">1</span>):</span>
<span class="line"><span class="anchor" id="line-2-30"></span> <span class="ResWord">if</span> <span class="ID">stop</span> <span class="ResWord">is</span> <span class="ResWord">None</span>:</span>
<span class="line"><span class="anchor" id="line-3-28"></span> <span class="ID">stop</span> = <span class="ID">start</span></span>
<span class="line"><span class="anchor" id="line-4-24"></span> <span class="ID">start</span> = <span class="Number">0</span></span>
<span class="line"><span class="anchor" id="line-5-20"></span> <span class="ResWord">else</span>:</span>
<span class="line"><span class="anchor" id="line-6-15"></span> <span class="ID">stop</span> = <span class="ResWord">int</span>(<span class="ID">stop</span>)</span>
<span class="line"><span class="anchor" id="line-7-9"></span> <span class="ID">start</span> = <span class="ResWord">int</span>(<span class="ID">start</span>)</span>
<span class="line"><span class="anchor" id="line-8-7"></span> <span class="ID">step</span> = <span class="ResWord">int</span>(<span class="ID">step</span>)</span>
<span class="line"><span class="anchor" id="line-9-7"></span></span>
<span class="line"><span class="anchor" id="line-10-6"></span> <span class="ResWord">while</span> <span class="ID">start</span> &lt; <span class="ID">stop</span>:</span>
<span class="line"><span class="anchor" id="line-11-5"></span> <span class="ResWord">yield</span> <span class="ID">start</span></span>
<span class="line"><span class="anchor" id="line-12-4"></span> <span class="ID">start</span> += <span class="ID">step</span></span>
</pre></div></div><span class="anchor" id="line-661"></span><span class="anchor" id="line-662"></span><p class="line874">Except that it is implemented in pure C. <span class="anchor" id="line-663"></span><span class="anchor" id="line-664"></span><p class="line867"><tt>xrange</tt> does have limitations. Specifically, it only works with <tt>int</tt>s; you cannot use <tt>long</tt>s or <tt>float</tt>s (they will be converted to <tt>int</tt>s, as shown above). <span class="anchor" id="line-665"></span><span class="anchor" id="line-666"></span><p class="line862">It does, however, save gobs of memory, and unless you store the yielded objects somewhere, only one yielded object will exist at a time. The difference is thus: When you call <tt>range</tt>, it creates a <tt>list</tt> containing so many number (<tt>int</tt>, <tt>long</tt>, or <tt>float</tt>) objects. All of those objects are created at once, and all of them exist at the same time. This can be a pain when the number of numbers is large. <span class="anchor" id="line-667"></span><span class="anchor" id="line-668"></span><p class="line867"><tt>xrange</tt>, on the other hand, creates <em>no</em> numbers immediately - only the range object itself. Number objects are created only when you pull on the generator, e.g. by looping through it. For example: <span class="anchor" id="line-669"></span><span class="anchor" id="line-670"></span><p class="line867"><span class="anchor" id="line-671"></span><span class="anchor" id="line-672"></span><span class="anchor" id="line-1-72"></span><div class="highlight python"><div class="codearea" dir="ltr" lang="en"><pre dir="ltr" id="CA-96aac040d3460ad1f88f051a9f01afb5c74e5959" lang="en"><span class="line"><span class="anchor" id="line-1-73"></span><span class="ResWord">xrange</span>(<span class="ID">sys</span>.<span class="ID">maxint</span>) <span class="Comment"># No loop, and no call to .next, so no numbers are instantiated</span></span>
</pre></div></div><span class="anchor" id="line-673"></span><span class="anchor" id="line-674"></span><p class="line862">And for this reason, the code runs instantly. If you substitute <tt>range</tt> there, Python will lock up; it will be too busy allocating <tt>sys.maxint</tt> number objects (about 2.1 billion on the typical PC) to do anything else. Eventually, it will run out of memory and exit. <span class="anchor" id="line-675"></span><span class="anchor" id="line-676"></span><p class="line862">In Python versions before 2.2, <tt>xrange</tt> objects also supported optimizations such as fast membership testing (<tt>i&nbsp;in&nbsp;xrange(n)</tt>). These features were removed in 2.2 due to lack of use. <span class="anchor" id="line-677"></span><span class="anchor" id="line-678"></span><p class="line867">
<h2 id="Re-map_Functions_at_runtime">Re-map Functions at runtime</h2>
<span class="anchor" id="line-679"></span><span class="anchor" id="line-680"></span><p class="line874">Say you have a function <span class="anchor" id="line-681"></span><span class="anchor" id="line-682"></span><p class="line867"><span class="anchor" id="line-683"></span><span class="anchor" id="line-684"></span><span class="anchor" id="line-685"></span><span class="anchor" id="line-686"></span><span class="anchor" id="line-687"></span><span class="anchor" id="line-688"></span><span class="anchor" id="line-689"></span><span class="anchor" id="line-690"></span><span class="anchor" id="line-691"></span><span class="anchor" id="line-692"></span><span class="anchor" id="line-693"></span><span class="anchor" id="line-694"></span><span class="anchor" id="line-695"></span><span class="anchor" id="line-696"></span><span class="anchor" id="line-697"></span><span class="anchor" id="line-1-74"></span><div class="highlight python"><div class="codearea" dir="ltr" lang="en"><pre dir="ltr" id="CA-c917e5f4c76cb477425baf3fda31185b1e40a1bf" lang="en"><span class="line"><span class="anchor" id="line-1-75"></span> <span class="ResWord">class</span> <span class="ID">Test</span>:</span>
<span class="line"><span class="anchor" id="line-2-31"></span> <span class="ResWord">def</span> <span class="ID">check</span>(<span class="ResWord">self</span>,<span class="ID">a</span>,<span class="ID">b</span>,<span class="ID">c</span>):</span>
<span class="line"><span class="anchor" id="line-3-29"></span> <span class="ResWord">if</span> <span class="ID">a</span> == <span class="Number">0</span>:</span>
<span class="line"><span class="anchor" id="line-4-25"></span> <span class="ResWord">self</span>.<span class="ID">str</span> = <span class="ID">b</span>*<span class="Number">100</span></span>
<span class="line"><span class="anchor" id="line-5-21"></span> <span class="ResWord">else</span>:</span>
<span class="line"><span class="anchor" id="line-6-16"></span> <span class="ResWord">self</span>.<span class="ID">str</span> = <span class="ID">c</span>*<span class="Number">100</span></span>
<span class="line"><span class="anchor" id="line-7-10"></span></span>
<span class="line"><span class="anchor" id="line-8-8"></span> <span class="ID">a</span> = <span class="ID">Test</span>()</span>
<span class="line"><span class="anchor" id="line-9-8"></span> <span class="ResWord">def</span> <span class="ID">example</span>():</span>
<span class="line"><span class="anchor" id="line-10-7"></span> <span class="ResWord">for</span> <span class="ID">i</span> <span class="ResWord">in</span> <span class="ResWord">xrange</span>(<span class="Number">0</span>,<span class="Number">100000</span>):</span>
<span class="line"><span class="anchor" id="line-11-6"></span> <span class="ID">a</span>.<span class="ID">check</span>(<span class="ID">i</span>,<span class="String">"</span><span class="String">b</span><span class="String">"</span>,<span class="String">"</span><span class="String">c</span><span class="String">"</span>)</span>
<span class="line"><span class="anchor" id="line-12-5"></span></span>
<span class="line"><span class="anchor" id="line-13-2"></span> <span class="ResWord">import</span> <span class="ID">profile</span></span>
<span class="line"><span class="anchor" id="line-14-2"></span> <span class="ID">profile</span>.<span class="ID">run</span>(<span class="String">"</span><span class="String">example()</span><span class="String">"</span>)</span>
</pre></div></div><span class="anchor" id="line-698"></span><span class="anchor" id="line-699"></span><p class="line874">And suppose this function gets called from somewhere else many times. <span class="anchor" id="line-700"></span><span class="anchor" id="line-701"></span><p class="line874">Well, your check will have an if statement slowing you down all the time except the first time, so you can do this: <span class="anchor" id="line-702"></span><span class="anchor" id="line-703"></span><p class="line867"><span class="anchor" id="line-704"></span><span class="anchor" id="line-705"></span><span class="anchor" id="line-706"></span><span class="anchor" id="line-707"></span><span class="anchor" id="line-708"></span><span class="anchor" id="line-709"></span><span class="anchor" id="line-710"></span><span class="anchor" id="line-711"></span><span class="anchor" id="line-712"></span><span class="anchor" id="line-713"></span><span class="anchor" id="line-714"></span><span class="anchor" id="line-715"></span><span class="anchor" id="line-716"></span><span class="anchor" id="line-717"></span><span class="anchor" id="line-718"></span><span class="anchor" id="line-1-76"></span><div class="highlight python"><div class="codearea" dir="ltr" lang="en"><pre dir="ltr" id="CA-bed4de0b660ca8fb61e8bc7f41ea72ef340ac036" lang="en"><span class="line"><span class="anchor" id="line-1-77"></span> <span class="ResWord">class</span> <span class="ID">Test2</span>:</span>
<span class="line"><span class="anchor" id="line-2-32"></span> <span class="ResWord">def</span> <span class="ID">check</span>(<span class="ResWord">self</span>,<span class="ID">a</span>,<span class="ID">b</span>,<span class="ID">c</span>):</span>
<span class="line"><span class="anchor" id="line-3-30"></span> <span class="ResWord">self</span>.<span class="ID">str</span> = <span class="ID">b</span>*<span class="Number">100</span></span>
<span class="line"><span class="anchor" id="line-4-26"></span> <span class="ResWord">self</span>.<span class="ID">check</span> = <span class="ResWord">self</span>.<span class="ID">check_post</span></span>
<span class="line"><span class="anchor" id="line-5-22"></span> <span class="ResWord">def</span> <span class="ID">check_post</span>(<span class="ResWord">self</span>,<span class="ID">a</span>,<span class="ID">b</span>,<span class="ID">c</span>):</span>
<span class="line"><span class="anchor" id="line-6-17"></span> <span class="ResWord">self</span>.<span class="ID">str</span> = <span class="ID">c</span>*<span class="Number">100</span></span>
<span class="line"><span class="anchor" id="line-7-11"></span></span>
<span class="line"><span class="anchor" id="line-8-9"></span> <span class="ID">a</span> = <span class="ID">Test2</span>()</span>
<span class="line"><span class="anchor" id="line-9-9"></span> <span class="ResWord">def</span> <span class="ID">example2</span>():</span>
<span class="line"><span class="anchor" id="line-10-8"></span> <span class="ResWord">for</span> <span class="ID">i</span> <span class="ResWord">in</span> <span class="ResWord">xrange</span>(<span class="Number">0</span>,<span class="Number">100000</span>):</span>
<span class="line"><span class="anchor" id="line-11-7"></span> <span class="ID">a</span>.<span class="ID">check</span>(<span class="ID">i</span>,<span class="String">"</span><span class="String">b</span><span class="String">"</span>,<span class="String">"</span><span class="String">c</span><span class="String">"</span>)</span>
<span class="line"><span class="anchor" id="line-12-6"></span></span>
<span class="line"><span class="anchor" id="line-13-3"></span> <span class="ResWord">import</span> <span class="ID">profile</span></span>
<span class="line"><span class="anchor" id="line-14-3"></span> <span class="ID">profile</span>.<span class="ID">run</span>(<span class="String">"</span><span class="String">example2()</span><span class="String">"</span>)</span>
</pre></div></div><span class="anchor" id="line-719"></span><span class="anchor" id="line-720"></span><p class="line874">Well, this example is fairly inadequate, but if the 'if' statement is a pretty complicated expression (or something with lots of dots), you can save yourself evaluating it, if you know it will only be true the first time. <span class="anchor" id="line-721"></span><span class="anchor" id="line-722"></span><p class="line867"><span class="anchor" id="Profiling"></span> <span class="anchor" id="line-723"></span>
<h2 id="Profiling_Code">Profiling Code</h2>
<span class="anchor" id="line-724"></span><span class="anchor" id="line-725"></span><p class="line874">The first step to speeding up your program is learning where the <span class="anchor" id="line-726"></span>bottlenecks lie. It hardly makes sense to optimize code that is never <span class="anchor" id="line-727"></span>executed or that already runs fast. I use two modules to help locate the <span class="anchor" id="line-728"></span>hotspots in my code, profile and trace. In later examples I also use the <span class="anchor" id="line-729"></span><tt>timeit</tt> module, which is new in Python 2.3. <span class="anchor" id="line-730"></span><span class="anchor" id="line-731"></span><p class="line867"><img alt="(!)" height="16" src="/moin/moin_static193/europython/img/idea.png" title="(!)" width="16" /> See the separate <a href="/moin/PythonSpeed/Profiling">profiling</a> document for alternatives to the approaches given below. <span class="anchor" id="line-732"></span><span class="anchor" id="line-733"></span><p class="line867">
<h3 id="Profiling-1">Profiling</h3>
<span class="anchor" id="line-734"></span><span class="anchor" id="line-735"></span><p class="line862">There are a number of <a class="http" href="http://docs.python.org/library/profile.html">profiling modules</a> included in the Python distribution. Using one of these to profile the execution of a set of functions is quite easy. Suppose your main function is called <tt class="backtick">main</tt>, takes no arguments and you want to execute it under the control of the <tt class="backtick">profile</tt> module. In its simplest form you just execute <span class="anchor" id="line-736"></span><span class="anchor" id="line-737"></span><p class="line867"><span class="anchor" id="line-738"></span><span class="anchor" id="line-739"></span><span class="anchor" id="line-740"></span><span class="anchor" id="line-1-78"></span><div class="highlight python"><div class="codearea" dir="ltr" lang="en"><pre dir="ltr" id="CA-148d58266321064d446bc043ac522ecbef08bd44" lang="en"><span class="line"><span class="anchor" id="line-1-79"></span><span class="ResWord">import</span> <span class="ID">profile</span></span>
<span class="line"><span class="anchor" id="line-2-33"></span><span class="ID">profile</span>.<span class="ID">run</span>(<span class="String">'</span><span class="String">main()</span><span class="String">'</span>)</span>
</pre></div></div><span class="anchor" id="line-741"></span><span class="anchor" id="line-742"></span><p class="line862">When <tt class="backtick">main()</tt> returns, the <tt class="backtick">profile</tt> module will print a table of function <span class="anchor" id="line-743"></span>calls and execution times. The output can be tweaked using the <tt class="backtick">Stats</tt> <span class="anchor" id="line-744"></span>class included with the module. From Python 2.4 <tt class="backtick">profile</tt> has permitted the <span class="anchor" id="line-745"></span>time consumed by Python builtins and functions in extension modules to be <span class="anchor" id="line-746"></span>profiled as well. <span class="anchor" id="line-747"></span><span class="anchor" id="line-748"></span><p class="line862">A slightly longer description of profiling using the <tt class="backtick">profile</tt> and <tt class="backtick">pstats</tt> modules can be found here (archived version): <span class="anchor" id="line-749"></span><span class="anchor" id="line-750"></span><p class="line867"><a class="http" href="http://web.archive.org/web/20060506162444/http://wingware.com/doc/howtos/performance-profiling-python-code">http://web.archive.org/web/20060506162444/http://wingware.com/doc/howtos/performance-profiling-python-code</a> <span class="anchor" id="line-751"></span><span class="anchor" id="line-752"></span><p class="line867">
<h3 id="The_cProfile_and_Hotshot_Modules">The cProfile and Hotshot Modules</h3>
<span class="anchor" id="line-753"></span><span class="anchor" id="line-754"></span><p class="line862">Since Python 2.2, the <a class="http" href="http://www.python.org/doc/current/lib/module-hotshot.html">hotshot package</a> has been available as a replacement for the <tt class="backtick">profile</tt> module, although the <tt class="backtick">cProfile</tt> module is now recommended in preference to <tt class="backtick">hotshot</tt>. The underlying module is written in C, so using <tt class="backtick">hotshot</tt> (or <tt class="backtick">cProfile</tt>) should result in a much smaller performance hit, and thus a more accurate idea of how your application is performing. There is also a <tt class="backtick">hotshotmain.py</tt> program in the distribution's <tt class="backtick">Tools/scripts</tt> directory which makes it easy to run your program under <tt class="backtick">hotshot</tt> control from the command line. <span class="anchor" id="line-755"></span><span class="anchor" id="line-756"></span><span class="anchor" id="line-757"></span><p class="line867">
<h3 id="Trace_Module">Trace Module</h3>
<span class="anchor" id="line-758"></span><span class="anchor" id="line-759"></span><p class="line874">The <span class="anchor" id="line-760"></span><a class="http" href="http://www.python.org/doc/current/lib/module-trace.html">trace module</a> <span class="anchor" id="line-761"></span>is a spin-off of the profile module I wrote originally <span class="anchor" id="line-762"></span>to perform some crude statement level test coverage. It's been heavily <span class="anchor" id="line-763"></span>modified by several other people since I released my initial crude <span class="anchor" id="line-764"></span>effort. As of Python 2.0 you should find trace.py in the Tools/scripts <span class="anchor" id="line-765"></span>directory of the Python distribution. Starting with Python 2.3 it's in <span class="anchor" id="line-766"></span>the standard library (the Lib directory). You can copy it to your local <span class="anchor" id="line-767"></span>bin directory and set the execute permission, then execute it directly. <span class="anchor" id="line-768"></span>It's easy to run from the command line to trace execution of whole scripts: <span class="anchor" id="line-769"></span><span class="anchor" id="line-770"></span><p class="line867"><span class="anchor" id="line-771"></span><span class="anchor" id="line-772"></span><pre><span class="anchor" id="line-1-4"></span>% trace.py -t spam.py eggs</pre><span class="anchor" id="line-773"></span><span class="anchor" id="line-774"></span><p class="line862">In Python 2.4 it's even easier to run. Just execute <tt>python&nbsp;-m&nbsp;trace</tt>. <span class="anchor" id="line-775"></span><span class="anchor" id="line-776"></span><p class="line874">There's no separate documentation, but you can execute "pydoc trace" to <span class="anchor" id="line-777"></span>view the inline documentation. <span class="anchor" id="line-778"></span><span class="anchor" id="line-779"></span><p class="line867">
<h3 id="Visualizing_Profiling_Results">Visualizing Profiling Results</h3>
<span class="anchor" id="line-780"></span><span class="anchor" id="line-781"></span><p class="line867"><a class="http" href="http://www.vrplumber.com/programming/runsnakerun/">RunSnakeRun</a> is a GUI tool by Mike Fletcher which visualizes profile dumps from cProfile using square maps. Function/method calls may be sorted according to various criteria, and source code may be displayed alongside the visualization and call statistics. <span class="anchor" id="line-782"></span><span class="anchor" id="line-783"></span><p class="line874">An example usage: <span class="anchor" id="line-784"></span><span class="anchor" id="line-785"></span><span class="anchor" id="line-786"></span><pre><span class="anchor" id="line-1-5"></span>runsnake some_profile_dump.prof</pre><span class="anchor" id="line-787"></span><span class="anchor" id="line-788"></span><p class="line867"><a class="http" href="http://code.google.com/p/jrfonseca/wiki/Gprof2Dot">Gprof2Dot</a> is a Python-based tool that can transform profiling results output into a graph that can be converted into a PNG image or SVG. <span class="anchor" id="line-789"></span><span class="anchor" id="line-790"></span><p class="line874">A typical profiling session with python 2.5 looks like this (on older platforms you will need to use the actual script instead of the -m option): <span class="anchor" id="line-791"></span><span class="anchor" id="line-792"></span><span class="anchor" id="line-793"></span><span class="anchor" id="line-794"></span><span class="anchor" id="line-795"></span><pre><span class="anchor" id="line-1-6"></span>python -m cProfile -o stat.prof MYSCRIPT.PY [ARGS...]
<span class="anchor" id="line-2-4"></span>python -m pbp.scripts.gprof2dot -f pstats -o stat.dot stat.prof
<span class="anchor" id="line-3-4"></span>dot -ostat.png -Tpng stat.dot</pre><span class="anchor" id="line-796"></span><span class="anchor" id="line-797"></span><p class="line867"><a class="http" href="http://pycallgraph.slowchop.com/">PyCallGraph</a> (pycallgraph) is a Python module that creates call graphs for Python programs. It generates a PNG file showing a module's function calls and their links to other function calls, the number of times a function was called and the time spent in that function. <span class="anchor" id="line-798"></span><span class="anchor" id="line-799"></span><p class="line874">Typical usage: <span class="anchor" id="line-800"></span><span class="anchor" id="line-801"></span><span class="anchor" id="line-802"></span><pre><span class="anchor" id="line-1-7"></span>pycallgraph scriptname.py</pre><span class="anchor" id="line-803"></span><p class="line867"><hr /><p class="line874"> <span class="anchor" id="line-804"></span><a href="/moin/CategoryDocumentation">CategoryDocumentation</a> <span class="anchor" id="line-805"></span><span class="anchor" id="bottom"></span></div><p id="pageinfo" class="info" lang="en" dir="ltr">PythonSpeed/PerformanceTips (last edited 2011-08-26 06:35:42 by <span title="StefanBehnel @ dslb-084-056-020-155.pools.arcor-ip.net[84.56.20.155]"><a href="/moin/StefanBehnel" title="StefanBehnel @ dslb-084-056-020-155.pools.arcor-ip.net[84.56.20.155]">StefanBehnel</a></span>)</p>
<div id="pagebottom"></div>
</div>
<div id="footer">
<ul id="credits">
<li><a href="http://moinmo.in/" title="This site uses the MoinMoin Wiki software.">MoinMoin Powered</a></li><li><a href="http://moinmo.in/Python" title="MoinMoin is written in Python.">Python Powered</a></li><li><a href="http://moinmo.in/GPL" title="MoinMoin is GPL licensed.">GPL licensed</a></li><li><a href="http://validator.w3.org/check?uri=referer" title="Click here to validate this page.">Valid HTML 4.01</a></li>
</ul>
</div>
</body>
</html>
Loading…
Cancel
Save