<?xml version="1.0" encoding="UTF-8"?>
<rss version="2.0"
	xmlns:content="http://purl.org/rss/1.0/modules/content/"
	xmlns:wfw="http://wellformedweb.org/CommentAPI/"
	xmlns:dc="http://purl.org/dc/elements/1.1/"
	xmlns:atom="http://www.w3.org/2005/Atom"
	xmlns:sy="http://purl.org/rss/1.0/modules/syndication/"
	xmlns:slash="http://purl.org/rss/1.0/modules/slash/"
	>

<channel>
	<title>Anrs Hu &#187; unpack</title>
	<atom:link href="http://anrs.sacredfir.com/archives/tag/unpack/feed" rel="self" type="application/rss+xml" />
	<link>http://anrs.sacredfir.com</link>
	<description>你所知的一切...</description>
	<lastBuildDate>Sun, 13 Dec 2009 10:34:27 +0000</lastBuildDate>
	<generator>http://wordpress.org/?v=2.8.4</generator>
	<language>zh-CN</language>
	<sy:updatePeriod>hourly</sy:updatePeriod>
	<sy:updateFrequency>1</sy:updateFrequency>
			<item>
		<title>unpack 的三种实现</title>
		<link>http://anrs.sacredfir.com/archives/108</link>
		<comments>http://anrs.sacredfir.com/archives/108#comments</comments>
		<pubDate>Tue, 08 Sep 2009 14:13:25 +0000</pubDate>
		<dc:creator>admin</dc:creator>
				<category><![CDATA[Coding]]></category>
		<category><![CDATA[unpack]]></category>
		<category><![CDATA[性能]]></category>

		<guid isPermaLink="false">http://anrs.sacredfir.com/?p=108</guid>
		<description><![CDATA[解析文件或者网络数据时，基本的操作就是 unpack 流内容，下面演示了三种不同的 unpack 手法，分别使用了 re, struct.unpack 和 list slice，最后是三种手法的性能比较。
代码：




#!/usr/bin/env python


def split_by_unpack&#40;txt&#41;:


&#160; &#160; import struct


&#160; &#160; baseformat = &#39;5s 3x 8s 8s&#39;


&#160; &#160; format = &#39;%s %ds&#39; % &#40;baseformat, len&#40;txt&#41; - struct.calcsize&#40;baseformat&#41;&#41;


&#160; &#160; return struct.unpack&#40;format, txt&#41;


&#160;


def _split_by_re&#40;&#41;:


&#160; &#160; import re


&#160; &#160; regex = re.compile&#40;r&#39;(.{5})(?:.{3})(.{8})(.{8})(.*)&#39;, re.I&#41;


&#160; &#160; def __split_by_re&#40;txt&#41;:


&#160; &#160; &#160; &#160; r = regex.match&#40;txt&#41;


&#160; &#160; [...]]]></description>
			<content:encoded><![CDATA[<p><span style="font-size: 16px;">解析文件或者网络数据时，基本的操作就是 unpack 流内容，下面演示了三种不同的 unpack 手法，分别使用了 re, struct.unpack 和 list slice，最后是三种手法的性能比较。</span></p>
<p><span style="font-size: 16px;">代码：</span></p>
<p><span>
<div class="dean_ch" style="white-space: wrap;">
<ol>
<li class="li1">
<div class="de1"><span class="co1">#!/usr/bin/env python</span></div>
</li>
<li class="li1">
<div class="de1"><span class="kw1">def</span> split_by_unpack<span class="br0">&#40;</span>txt<span class="br0">&#41;</span>:</div>
</li>
<li class="li1">
<div class="de1">&nbsp; &nbsp; <span class="kw1">import</span> <span class="kw3">struct</span></div>
</li>
<li class="li1">
<div class="de1">&nbsp; &nbsp; baseformat = <span class="st0">&#39;5s 3x 8s 8s&#39;</span></div>
</li>
<li class="li2">
<div class="de2">&nbsp; &nbsp; format = <span class="st0">&#39;%s %ds&#39;</span> % <span class="br0">&#40;</span>baseformat, <span class="kw2">len</span><span class="br0">&#40;</span>txt<span class="br0">&#41;</span> - <span class="kw3">struct</span>.<span class="me1">calcsize</span><span class="br0">&#40;</span>baseformat<span class="br0">&#41;</span><span class="br0">&#41;</span></div>
</li>
<li class="li1">
<div class="de1">&nbsp; &nbsp; <span class="kw1">return</span> <span class="kw3">struct</span>.<span class="me1">unpack</span><span class="br0">&#40;</span>format, txt<span class="br0">&#41;</span></div>
</li>
<li class="li1">
<div class="de1">&nbsp;</div>
</li>
<li class="li1">
<div class="de1"><span class="kw1">def</span> _split_by_re<span class="br0">&#40;</span><span class="br0">&#41;</span>:</div>
</li>
<li class="li1">
<div class="de1">&nbsp; &nbsp; <span class="kw1">import</span> <span class="kw3">re</span></div>
</li>
<li class="li2">
<div class="de2">&nbsp; &nbsp; regex = <span class="kw3">re</span>.<span class="kw2">compile</span><span class="br0">&#40;</span>r<span class="st0">&#39;(.{5})(?:.{3})(.{8})(.{8})(.*)&#39;</span>, <span class="kw3">re</span>.<span class="me1">I</span><span class="br0">&#41;</span></div>
</li>
<li class="li1">
<div class="de1">&nbsp; &nbsp; <span class="kw1">def</span> __split_by_re<span class="br0">&#40;</span>txt<span class="br0">&#41;</span>:</div>
</li>
<li class="li1">
<div class="de1">&nbsp; &nbsp; &nbsp; &nbsp; r = regex.<span class="me1">match</span><span class="br0">&#40;</span>txt<span class="br0">&#41;</span></div>
</li>
<li class="li1">
<div class="de1">&nbsp; &nbsp; &nbsp; &nbsp; <span class="kw1">return</span> <span class="br0">&#40;</span>r.<span class="me1">group</span><span class="br0">&#40;</span><span class="nu0">1</span><span class="br0">&#41;</span>, r.<span class="me1">group</span><span class="br0">&#40;</span><span class="nu0">2</span><span class="br0">&#41;</span>, r.<span class="me1">group</span><span class="br0">&#40;</span><span class="nu0">3</span><span class="br0">&#41;</span>, r.<span class="me1">group</span><span class="br0">&#40;</span><span class="nu0">4</span><span class="br0">&#41;</span><span class="br0">&#41;</span></div>
</li>
<li class="li1">
<div class="de1">&nbsp; &nbsp; <span class="kw1">return</span> __split_by_re</div>
</li>
<li class="li2">
<div class="de2">split_by_re = _split_by_re<span class="br0">&#40;</span><span class="br0">&#41;</span></div>
</li>
<li class="li1">
<div class="de1">&nbsp;</div>
</li>
<li class="li1">
<div class="de1"><span class="kw1">def</span> split_by_slice<span class="br0">&#40;</span>txt<span class="br0">&#41;</span>:</div>
</li>
<li class="li1">
<div class="de1">&nbsp; &nbsp; cuts = <span class="br0">&#91;</span><span class="nu0">5</span>, <span class="nu0">8</span>, <span class="nu0">16</span>, <span class="nu0">24</span><span class="br0">&#93;</span></div>
</li>
<li class="li1">
<div class="de1">&nbsp; &nbsp; pieces = <span class="br0">&#91;</span>txt<span class="br0">&#91;</span>i:j<span class="br0">&#93;</span> <span class="kw1">for</span> i, j <span class="kw1">in</span> <span class="kw2">zip</span><span class="br0">&#40;</span><span class="br0">&#91;</span><span class="nu0">0</span><span class="br0">&#93;</span> + cuts, cuts + <span class="br0">&#91;</span><span class="kw2">None</span><span class="br0">&#93;</span><span class="br0">&#41;</span><span class="br0">&#93;</span></div>
</li>
<li class="li2">
<div class="de2">&nbsp; &nbsp; <span class="kw1">return</span> <span class="br0">&#40;</span>pieces<span class="br0">&#91;</span><span class="nu0">0</span><span class="br0">&#93;</span>, pieces<span class="br0">&#91;</span><span class="nu0">2</span><span class="br0">&#93;</span>, pieces<span class="br0">&#91;</span><span class="nu0">3</span><span class="br0">&#93;</span>, pieces<span class="br0">&#91;</span><span class="nu0">4</span><span class="br0">&#93;</span><span class="br0">&#41;</span></div>
</li>
<li class="li1">
<div class="de1">&nbsp;</div>
</li>
<li class="li1">
<div class="de1">&nbsp;</div>
</li>
<li class="li1">
<div class="de1"><span class="kw1">if</span> __name__ == <span class="st0">&#39;__main__&#39;</span>:</div>
</li>
<li class="li1">
<div class="de1">&nbsp; &nbsp; txt = <span class="st0">&#39;12345###abcdefghABCDEFGH###&#39;</span></div>
</li>
<li class="li2">
<div class="de2">&nbsp; &nbsp; <span class="kw1">for</span> i <span class="kw1">in</span> <span class="kw2">xrange</span><span class="br0">&#40;</span><span class="nu0">1000000</span><span class="br0">&#41;</span>:</div>
</li>
<li class="li1">
<div class="de1">&nbsp; &nbsp; &nbsp; &nbsp; split_by_slice<span class="br0">&#40;</span>txt<span class="br0">&#41;</span></div>
</li>
<li class="li1">
<div class="de1">&nbsp; &nbsp; &nbsp; &nbsp; split_by_unpack<span class="br0">&#40;</span>txt<span class="br0">&#41;</span></div>
</li>
<li class="li1">
<div class="de1">&nbsp; &nbsp; &nbsp; &nbsp; split_by_re<span class="br0">&#40;</span>txt<span class="br0">&#41;</span></div>
</li>
<li class="li1">
<div class="de1">&nbsp;</div>
</li>
</ol>
</div>
<p></span></p>
<p><span style="font-size: 16px;">上面代码演示了将文本 unpack 为长度分别是 5, 3(忽略), 8, 8, <font style="color:red">k</font> 的子串，下面是对三个函数进行百万次调用后的性能分析：</span></p>
<p><span><img src="http://anrs.sacredfir.com/wp-content/uploads/2009/09/screenshot_052.png" alt="screenshot_052" title="screenshot_052" width="650" height="85" class="alignnone size-full wp-image-115" /></span></p>
<p><span style="font-size: 16px;">其实三种手法还是有一些差异的，struct.unpack 是基于 bytes 来拆分，而 re 和 list slice 是基于字符拆分的，所以在非 ASCII 的情况下要特别注意。</span></p>
]]></content:encoded>
			<wfw:commentRss>http://anrs.sacredfir.com/archives/108/feed</wfw:commentRss>
		<slash:comments>2</slash:comments>
		</item>
	</channel>
</rss>
