[Inteproxy-commits] r355 - in branches/compression: . inteproxy

Wed Feb 29 10:57:32 CET 2012

Author: aheinecke
Date: 2012-02-29 10:57:31 +0100 (Wed, 29 Feb 2012)
New Revision: 355

Added:
   branches/compression/inteproxy/decompressstream.py
Modified:
   branches/compression/ChangeLog
   branches/compression/inteproxy/httpmessage.py
   branches/compression/inteproxy/proxycore.py
Log:
Refactor compression: HTTPMessage now reads from an
internal _body_stream, which is a DecompressStream
if do_decompress is called before reading.

do_decompress now also modifies the headers to remove
the Content-Encoding and Content-Length headers.

The body stream in httpmessage is now also set
when the body is set.



Modified: branches/compression/ChangeLog
===================================================================

--- branches/compression/ChangeLog	2012-02-24 16:04:28 UTC (rev 354)
+++ branches/compression/ChangeLog	2012-02-29 09:57:31 UTC (rev 355)
@@ -1,3 +1,16 @@
+2012-02-29	Andre Heinecke	<aheinecke at intevation.de>
+	* M inteproxy/proxycore.py:
+	  Remove decompressed_read method.
+	  Remove method get_decompress_object
+	* M inteproxy/httpmessage.py:
+	  Added do_decompress method to httpresponse to select decompression 
+	  of the response.
+	  Read does now create a decompression object if necessary to decompress
+	  the input stream of the response.
+	* A inteproxy/decompressstream.py:
+	  Add decompress stream class to wrap around an input stream for
+	  decompressed reading.
+
 2012-02-24	Andre Heinecke	<aheinecke at intevation.de>
 	* M inteproxy/proxycore.py:
 	  Add method decompressed_read to read decompressed

Added: branches/compression/inteproxy/decompressstream.py
===================================================================
--- branches/compression/inteproxy/decompressstream.py	                        (rev 0)
+++ branches/compression/inteproxy/decompressstream.py	2012-02-29 09:57:31 UTC (rev 355)
@@ -0,0 +1,41 @@
+# Copyright (C) 2012 by Intevation GmbH
+# Authors:
+# Bernhard Herzog <bh at intevation.de>
+# Andre Heinecke <aheinecke at intevation.de>
+#
+# This program is free software under the GPL (>=v2)
+# Read the file COPYING coming with the software for details.
+
+""" On the fly decompression of a data stream """
+
+class DecompressStream(object):
+    """A class to wrap around a data stream that contains
+    compressed data.
+
+    The decompression object can be given at construction time.
+    """
+
+    def __init__(self, infile, decompressobj):
+        """
+        Initialize the DecompressStream with a input stream and a decompressor
+        """
+        self.infile = infile
+        self.decompressor = decompressobj
+
+    def read(self, amount):
+        if amount == -1:
+            return self.decompressor.decompress(self.infile.read())
+        decompressed_chunks = []
+        count = 0
+        while count < amount:
+            max_read = amount - count
+            compressed = self.decompressor.unconsumed_tail
+            if not compressed:
+                compressed = self.infile.read(amount)
+                if not compressed:
+                    break
+            deflated = self.decompressor.decompress(compressed, max_read)
+            count += len(deflated)
+            decompressed_chunks.append(deflated)
+
+        return "".join(decompressed_chunks)

Modified: branches/compression/inteproxy/httpmessage.py
===================================================================
--- branches/compression/inteproxy/httpmessage.py	2012-02-24 16:04:28 UTC (rev 354)
+++ branches/compression/inteproxy/httpmessage.py	2012-02-29 09:57:31 UTC (rev 355)
@@ -8,6 +8,8 @@
 """Abstractions for http request and response messages"""
 
 from StringIO import StringIO
+from inteproxy.decompressstream import DecompressStream
+from zlib import decompressobj, MAX_WBITS
 
 class HTTPMessage(object):
 
@@ -66,6 +68,7 @@
         if content_type is not None:
             self.headers["Content-type"] = content_type
         self._body = body
+        self._body_stream = StringIO(self.body)
 
     def get_body(self):
         self.read_entire_message()
@@ -77,8 +80,6 @@
         raise NotImplementedError
 
     def read(self, amount):
-        if self._body_stream is None and self.body_has_been_read():
-            self._body_stream = StringIO(self.body)
         if self._body_stream is not None:
             data = self._body_stream.read(amount)
         else:
@@ -145,28 +146,87 @@
         self.version = version
         self.status = status
         self.reason = reason
+        self.__decompress = False
+        self.started_reading = False
 
     def debug_log_message(self, log_function):
         log_function("HTTPResponseMessage: %s %s %s",
                      self.version, self.status, self.reason)
         super(HTTPResponseMessage, self).debug_log_message(log_function)
 
-    def read_entire_message(self, decompressor = None):
+    def read_entire_message(self):
         """
         Read the entire message and set the messages body.
         If the optional decompressor parameter is given the
         body will be decompressed.
         """
-        if self.body_has_been_read():
+        if not self.body_has_been_read():
+            self.set_body(self.read())
+
+    def do_decompress(self):
+        """
+        Decompress the input stream on read if possible.
+
+        If the content-encoding of the message is either gzip
+        or deflate the Content-Encoding and Content-Length
+        headers will also be removed.
+        """
+        if self.__decompress:
             return
-        length = int(self.headers.get("Content-Length", "0"))
-        if length:
-            # Not using chunked so we read everything at once
-            if decompressor:
-                self.set_body(decompressor.decompress(self.read(length)))
+
+        if not self.headers.get("Content-Encoding"):
+            return
+
+        if self.headers.get("Content-Encoding") == "deflate":
+            self.__decompress = "deflate"
+        elif self.headers.get("Content-Encoding") == "gzip":
+            self.__decompress = "gzip"
+
+        if self.__decompress:
+            # Can decompress the input
+            if self.started_reading:
+                self.__decompress = False
+                raise Exception("do_decompress called after first read")
+
+            del self.headers["Content-Encoding"]
+            if self.headers.get("Content-Length"):
+                del self.headers["Content-Length"]
+
+    def read(self, amount = -1):
+        """
+        Read the message up to amount bytes and return a data string of
+        length amount.
+        If do_decompress was called before this will return the data in
+        a decompressed form if possible.
+        """
+
+        if self.started_reading == False and amount != 0:
+            self.started_reading = True
+
+        if self._body_stream is None:
+            if self.__decompress:
+                # Create the decompression object
+                #
+                # On deflate decompression -zlib.MAX_WBITS is given to ensure
+                # that non-RFC-conforming responses, as they are sent by most
+                # http servers, are decompressed correctly by ignoring a
+                # possibly invalid header.
+
+                # To decompress gzip with zlib, 16 needs to be added to the
+                # wbits parameter.
+
+                # See the documentation of inflateInit2 at
+                # http://zlib.net/manual.html
+
+                if self.__decompress == "deflate":
+                    self._body_stream = DecompressStream(self.infile,
+                            decompressobj(-MAX_WBITS))
+                elif self.__decompress == "gzip":
+                    self._body_stream = DecompressStream(self.infile,
+                            decompressobj(16 + MAX_WBITS))
+                else:
+                    raise Exception("Invalid decompress method"
+                            " in HTTPResponse")
             else:
-                self.set_body(self.read(length))
-        elif not self.headers.get("Content-Length"):
-            # Can't read the entire message because we are chunked
-            # FIXME how to handle this?
-            pass
+                self._body_stream = self.infile
+        return self._body_stream.read(amount)

Modified: branches/compression/inteproxy/proxycore.py
===================================================================
--- branches/compression/inteproxy/proxycore.py	2012-02-24 16:04:28 UTC (rev 354)
+++ branches/compression/inteproxy/proxycore.py	2012-02-29 09:57:31 UTC (rev 355)
@@ -330,36 +330,6 @@
             self.send_header(header, value)
         self.end_headers()
 
-    def get_decompress_object(self, response):
-        # Set up the response for decompression
-
-        # On defate decompression -zlib.MAX_WBITS is given to ensure
-        # that non RFC confirming responses as they are sent by most
-        # http servers are decompressed correctly by ignoring a
-        # possibly invlaid header.
-
-        # To decompress gzip with zlib 16 needs to be added to the wbits
-        # parameter
-
-        # See the documention of inflateInit2 at http://zlib.net/manual.html
-        if response.headers.get("Content-Encoding") == "deflate":
-            return zlib.decompressobj(-zlib.MAX_WBITS)
-        elif response.headers.get("Content-Encoding") == "gzip":
-            return zlib.decompressobj(16 + zlib.MAX_WBITS)
-
-    def decompressed_read(self, orig_read, response, amount):
-        # Read <amount> bytes with the orig_read function from 
-        # response and return it decompressed
-        # if self.decompressor is set.
-        # The Return value can be larger then amount
-        raw_data = orig_read(amount)
-        if self.decompressor:
-            if not response.body_has_been_read():
-                return self.decompressor.decompress(raw_data)
-            else:
-                return raw_data
-        return raw_data
-
     def handle_response(self, response):
         # The HTTP version in the reply generated by send_response is
         # taken from self.protocol_version.  We simply set it to
@@ -373,12 +343,7 @@
         do_chunked = response.headers.get("Transfer-encoding") == "chunked"
 
         if do_rewrite or self.should_decompress_response:
-            self.decompressor = self.get_decompress_object(response)
-            if self.decompressor:
-                # Reflect the decompression in the headers
-                del response.headers["Content-Encoding"]
-                if int(response.headers.get("Content-Length", "0")):
-                    response.read_entire_message(self.decompressor)
+            response.do_decompress()
 
         if do_chunked and do_rewrite:
             self.send_headers(response)
@@ -441,7 +406,7 @@
         append = data.append
 
         while True:
-            chunk = self.decompressed_read(read, response, length)
+            chunk = response.read(length)
             if not chunk:
                 break
 
@@ -490,7 +455,7 @@
                 chunk_size = min(length, max_chunk_size)
             else:
                 chunk_size = max_chunk_size
-            chunk = self.decompressed_read(read, response, chunk_size)
+            chunk = response.read(chunk_size)
             if not chunk:
                 break
             if length is not None: