Sophie

Sophie

distrib > Mandriva > 2010.2 > i586 > media > contrib-backports > by-pkgid > df29c83ca401d91ec9c00bfcf7fea4ea > files > 257

shedskin-0.8-2mdv2010.2.i586.rpm

#!/usr/bin/env python
# -*- coding: utf-8 -*-

### to compile this, copy lib/hashlib.* to the shedskin lib dir!

"""
This is a pure Python implementation of the [rsync algorithm](TM96).

[TM96] Andrew Tridgell and Paul Mackerras. The rsync algorithm.
Technical Report TR-CS-96-05, Canberra 0200 ACT, Australia, 1996.
http://samba.anu.edu.au/rsync/.

"""

# taken from: http://code.activestate.com/recipes/577518-rsync-algorithm/

import collections
import hashlib

class Element:
    def __init__(self, index=-1, data=None):
        self.index = index
        self.data = data

def rsyncdelta(datastream, remotesignatures, blocksize):
    """
    Generates a binary patch when supplied with the weak and strong
    hashes from an unpatched target and a readable stream for the
    up-to-date data. The blocksize must be the same as the value
    used to generate remotesignatures.
    """
    remote_weak, remote_strong = remotesignatures

    match = True
    matchblock = -1
    delta = []

    while True:
        if match and datastream is not None:
            # Whenever there is a match or the loop is running for the first
            # time, populate the window using weakchecksum instead of rolling
            # through every single byte which takes at least twice as long.
            window = collections.deque(datastream.read(blocksize))
            checksum, a, b = weakchecksum(''.join(window))

        try:
            # If there are two identical weak checksums in a file, and the
            # matching strong hash does not occur at the first match, it will
            # be missed and the data sent over. May fix eventually, but this
            # problem arises very rarely.
            matchblock = remote_weak.index(checksum, matchblock + 1)
            stronghash = hashlib.md5(''.join(window)).hexdigest()

            if remote_strong[matchblock] == stronghash:
                match = True
                delta.append(Element(index=matchblock))

                if datastream.closed:
                    break
                continue

        except ValueError:
            # The weakchecksum did not match
            match = False
            if datastream:
                # Get the next byte and affix to the window
                newchar = datastream.read(1)
                if newchar:
                    window.append(newchar)
                else:
                    # No more data from the file; the window will slowly shrink.
                    # newchar needs to be zero from here on to keep the checksum
                    # correct.
                    newchar = '\0'
                    tailsize = datastream.tell() % blocksize
                    datastream = None

            if datastream is None and len(window) <= tailsize:
                # The likelihood that any blocks will match after this is
                # nearly nil so call it quits.
                delta.append(Element(data=list(window)))
                break

            # Yank off the extra byte and calculate the new window checksum
            oldchar = window.popleft()
            checksum, a, b = rollingchecksum(oldchar, newchar, a, b, blocksize)

            # Add the old byte the file delta. This is data that was not found
            # inside of a matching block so it needs to be sent to the target.
            if delta:
                delta[-1].data.append(oldchar)
            else:
                delta.append(Element(data=[oldchar]))

    return delta


def blockchecksums(instream, blocksize):
    """
    Returns a list of weak and strong hashes for each block of the
    defined size for the given data stream.
    """
    weakhashes = list()
    stronghashes = list()
    read = instream.read(blocksize)

    while read:
        weakhashes.append(weakchecksum(read)[0])
        stronghashes.append(hashlib.md5(read).hexdigest())
        read = instream.read(blocksize)

    return weakhashes, stronghashes


def patchstream(instream, outstream, delta, blocksize):
    """
    Patches instream using the supplied delta and write the resultant
    data to outstream.
    """
    for element in delta:
        if element.index != -1:
            instream.seek(element.index * blocksize)
            data = instream.read(blocksize)
        else:
            data = ''.join(element.data)
        outstream.write(data)


def rollingchecksum(removed, new, a, b, blocksize):
    """
    Generates a new weak checksum when supplied with the internal state
    of the checksum calculation for the previous window, the removed
    byte, and the added byte.
    """
    a -= ord(removed) - ord(new)
    b -= ord(removed) * blocksize - a
    return (b << 16) | a, a, b


def weakchecksum(data):
    """
    Generates a weak checksum from an iterable set of bytes.
    """
    a = b = 0
    l = len(data)

    for i in range(l):
        n = ord(data[i])
        a += n
        b += (l - i)*n

    return (b << 16) | a, a, b

if __name__ == '__main__':
    blocksize = 4096

    # On the system containing the file that needs to be patched
    unpatched = open("testdata/unpatched.file", "rb")
    hashes = blockchecksums(unpatched, blocksize)

    # On the remote system after having received `hashes`
    patchedfile = open("testdata/patched.file", "rb")
    delta = rsyncdelta(patchedfile, hashes, blocksize)

    # System with the unpatched file after receiving `delta`
    unpatched.seek(0)
    save_to = open("testdata/locally-patched.file", "wb")
    patchstream(unpatched, save_to, delta, blocksize)