Rsync is a powerful file synchronization tool known for its efficient differential transfer algorithm. Although rsync itself is a command-line tool, the same algorithm is available to programs through libraries such as librsync, which allows developers to integrate file synchronization features into their applications.
Core Principles of Rsync
The core advantage of rsync lies in its differential transfer algorithm. It achieves efficient synchronization through the following steps:
- File Chunking: Dividing files into fixed-size chunks
- Checksum Calculation: Calculating weak checksums (rolling checksum) and strong checksums (MD5) for each chunk
- Difference Detection: Comparing checksums of the source and target files to identify changed chunks
- Incremental Transfer: Transmitting only the changed chunks
Using the Librsync Library
Librsync is a C language implementation of the rsync algorithm, providing a complete programming interface.
Installing Librsync
On Ubuntu/Debian systems:
sudo apt-get install librsync-dev
On CentOS/RHEL systems:
sudo yum install librsync-devel
Basic Usage Example
The following is a simple C program demonstrating how to use the librsync library to compute file signatures and generate differences:
#include <stdio.h>
#include <stdlib.h>
#include <librsync.h>
// Abort the program when a librsync call did not succeed.
// For any result other than RS_DONE, prints "<msg>: <librsync error text>"
// to stderr and exits with status 1; otherwise returns normally.
void handle_error(rs_result result, const char *msg) {
    if (result == RS_DONE) {
        return;
    }

    fprintf(stderr, "%s: %s\n", msg, rs_strerror(result));
    exit(1);
}
// Generate file signature
void generate_signature(const char *basis_file, const char *sig_file) {
FILE *basis, *sig;
rs_stats_t stats;
rs_result result;
// Open files
basis = fopen(basis_file, "rb");
sig = fopen(sig_file, "wb");
if (!basis || !sig) {
perror("Unable to open file");
exit(1);
}
// Generate signature
result = rs_sig_file(basis, sig, RS_DEFAULT_BLOCK_LEN,
RS_DEFAULT_STRONG_LEN, &stats);
fclose(basis);
fclose(sig);
handle_error(result, "Failed to generate signature");
printf("Signature generated successfully: %s -> %s\n", basis_file, sig_file);
}
// Generate a delta file describing how to turn the basis (represented by
// its signature in sig_file) into new_file, and write it to delta_file.
// Exits the process with status 1 if a file cannot be opened or if librsync
// reports an error.
void generate_delta(const char *sig_file, const char *new_file, const char *delta_file) {
    // "new" is avoided as an identifier so the example also compiles as C++.
    FILE *sig, *new_fp, *delta;
    rs_signature_t *sumset = NULL;
    rs_result result;
    rs_stats_t stats;

    // Open and check each file separately so perror() reports the errno of
    // the call that actually failed.
    sig = fopen(sig_file, "rb");
    if (!sig) {
        perror("Unable to open file");
        exit(1);
    }
    new_fp = fopen(new_file, "rb");
    if (!new_fp) {
        perror("Unable to open file");
        exit(1);
    }
    delta = fopen(delta_file, "wb");
    if (!delta) {
        perror("Unable to open file");
        exit(1);
    }

    // Load signature
    result = rs_loadsig_file(sig, &sumset, &stats);
    fclose(sig);
    handle_error(result, "Failed to load signature");

    // A freshly loaded signature must be indexed before it can be searched;
    // rs_delta_file() relies on this hash table to look up matching blocks.
    result = rs_build_hash_table(sumset);
    handle_error(result, "Failed to build signature hash table");

    // Generate delta
    result = rs_delta_file(sumset, new_fp, delta, &stats);
    rs_free_sumset(sumset);
    fclose(new_fp);
    fclose(delta);
    handle_error(result, "Failed to generate delta");
    printf("Delta generated successfully: %s + %s -> %s\n", sig_file, new_file, delta_file);
}
// Rebuild a file by applying delta_file to basis_file, writing the result
// to new_file. Exits with status 1 if any file cannot be opened or if
// librsync reports an error while patching.
void apply_patch(const char *basis_file, const char *delta_file, const char *new_file) {
    rs_stats_t patch_stats;
    rs_result status;

    // Open the two inputs and the output
    FILE *basis_fp = fopen(basis_file, "rb");
    FILE *delta_fp = fopen(delta_file, "rb");
    FILE *out_fp = fopen(new_file, "wb");
    if (basis_fp == NULL || delta_fp == NULL || out_fp == NULL) {
        perror("Unable to open file");
        exit(1);
    }

    // Apply the delta, then release every handle before checking the result
    status = rs_patch_file(basis_fp, delta_fp, out_fp, &patch_stats);
    fclose(basis_fp);
    fclose(delta_fp);
    fclose(out_fp);

    handle_error(status, "Failed to apply delta");
    printf("Patch applied successfully: %s + %s -> %s\n", basis_file, delta_file, new_file);
}
// Demo driver: writes two small text files, then runs the full
// signature -> delta -> patch round trip on them.
// Returns 0 on success and 1 if the test files cannot be created; the
// helper functions exit with status 1 on their own failures.
int main(void) {
    const char *old_file = "old_file.txt";
    const char *new_file = "new_file.txt";
    const char *sig_file = "file.sig";
    const char *delta_file = "file.delta";
    const char *reconstructed_file = "reconstructed.txt";

    // Create test files. Stop here if either cannot be created, rather than
    // running the demo against missing or stale inputs.
    FILE *f1 = fopen(old_file, "w");
    FILE *f2 = fopen(new_file, "w");
    if (!f1 || !f2) {
        perror("Unable to create test files");
        if (f1) {
            fclose(f1);
        }
        if (f2) {
            fclose(f2);
        }
        return 1;
    }
    fprintf(f1, "This is the content of a test file. This is the old version of the file.\n");
    fprintf(f2, "This is the content of a test file. This is the new version of the file, with some modifications.\n");
    fclose(f1);
    fclose(f2);

    printf("Starting rsync demonstration...\n");

    // Generate signature
    generate_signature(old_file, sig_file);

    // Generate delta
    generate_delta(sig_file, new_file, delta_file);

    // Apply delta to reconstruct file
    apply_patch(old_file, delta_file, reconstructed_file);

    printf("Rsync demonstration completed!\n");
    return 0;
}
Compilation command:
gcc -o rsync_demo rsync_demo.c -lrsync
Python Rsync Implementation
Although there is no official rsync library for Python, we can use subprocess to call the rsync command or use third-party libraries like pyrsync.
Using Subprocess to Call Rsync Command
import subprocess
import os
def rsync_sync(source, destination, verbose=False, delete=False):
    """
    Synchronize files using rsync

    Args:
        source: Source path
        destination: Destination path
        verbose: Whether to display detailed information
        delete: Whether to delete files in the destination that do not exist in the source

    Returns:
        True if rsync exited successfully, False if it failed or the
        rsync executable could not be found.
    """
    # Archive mode is always on; -v is added only on request so that the
    # verbose flag actually controls rsync's verbosity.
    cmd = ['rsync', '-a']
    if verbose:
        cmd.append('-v')
    if delete:
        cmd.append('--delete')
    cmd.extend([source, destination])

    try:
        result = subprocess.run(cmd, capture_output=True, text=True, check=True)
    except FileNotFoundError as e:
        # Raised when the rsync binary is not installed or not on PATH.
        print(f"Synchronization failed: {e}")
        return False
    except subprocess.CalledProcessError as e:
        print(f"Synchronization failed: {e}")
        print("Error output:", e.stderr)
        return False

    print("Synchronization successful!")
    if verbose:
        print("Output:", result.stdout)
    return True
def rsync_dry_run(source, destination):
    """
    Perform a dry run of rsync (without actually transferring files)

    Args:
        source: Source path
        destination: Destination path

    Returns:
        True if rsync exited successfully, False if it failed or the
        rsync executable could not be found.
    """
    cmd = ['rsync', '-av', '--dry-run', source, destination]

    try:
        result = subprocess.run(cmd, capture_output=True, text=True, check=True)
    except FileNotFoundError as e:
        # Raised when the rsync binary is not installed or not on PATH.
        print(f"Dry run failed: {e}")
        return False
    except subprocess.CalledProcessError as e:
        print(f"Dry run failed: {e}")
        # Show rsync's own diagnostics, as rsync_sync does.
        print("Error output:", e.stderr)
        return False

    print("Dry run results:")
    print(result.stdout)
    return True
# Usage example: build a small source tree, preview the sync, then run it.
if __name__ == "__main__":
    # Create test directories and files
    os.makedirs("test_source", exist_ok=True)
    os.makedirs("test_dest", exist_ok=True)

    with open("test_source/file1.txt", "w") as f:
        f.write("Content of test file 1")
    with open("test_source/file2.txt", "w") as f:
        f.write("Content of test file 2")

    # The trailing slash on the source makes rsync copy the directory's
    # contents rather than the directory itself.
    print("Executing dry run:")
    rsync_dry_run("test_source/", "test_dest/")

    print("\nExecuting actual synchronization:")
    rsync_sync("test_source/", "test_dest/", verbose=True)
Using Pyrsync Library
First, install pyrsync:
pip install pyrsync
Example code:
import io
import os

# NOTE(review): this example assumes pyrsync exposes the three stream-based
# functions blockchecksums / rsyncdelta / patchstream, and that the delta is a
# list whose first element is the block size. Confirm against the installed
# pyrsync version.
from pyrsync import blockchecksums, rsyncdelta, patchstream


def demo_pyrsync():
    """Demonstrate basic usage of pyrsync.

    Computes block checksums for an in-memory "original" payload, derives a
    delta that turns it into a "modified" payload, applies the delta, and
    prints whether the reconstruction matches.
    """
    # Create test data
    original_data = b"This is the content of the original file, containing some text data for demonstration."
    modified_data = b"This is the modified file content, containing some updated text data for demonstration."

    print("Original data length:", len(original_data))
    print("Modified data length:", len(modified_data))

    # Calculate block checksums for original data.
    # The functions read from file-like objects, so wrap the bytes in BytesIO.
    block_size = 8
    weak_hashes, strong_hashes = blockchecksums(io.BytesIO(original_data), block_size)
    print(f"\nGenerated {len(weak_hashes)} block checksums")

    # Calculate differences
    delta = list(rsyncdelta(io.BytesIO(modified_data),
                            (weak_hashes, strong_hashes), block_size))
    # Presumably delta[0] is the block size and the rest are instructions.
    instructions = delta[1:]
    print(f"\nGenerated {len(instructions)} difference instructions")

    # Display difference instructions: an int refers to a block of the
    # original data, anything else is literal data to insert.
    for i, instruction in enumerate(instructions):
        if isinstance(instruction, int):
            print(f"Instruction {i}: Reference block, index {instruction}")
        else:
            print(f"Instruction {i}: Data block, length {len(instruction)}")

    # Apply differences to reconstruct data
    output = io.BytesIO()
    patchstream(io.BytesIO(original_data), output, delta)
    reconstructed = output.getvalue()
    print(f"\nReconstruction verification: {'Success' if reconstructed == modified_data else 'Failure'}")
    print("Reconstructed data:", reconstructed.decode('utf-8'))


if __name__ == "__main__":
    demo_pyrsync()
Advanced Application: Implementing a Simple File Synchronization Service
The following is a more complete example demonstrating how to implement a simple file synchronization service based on the principles of rsync:
import os
import hashlib
import struct
from typing import List, Tuple, Dict
class SimpleRsync:
    """Minimal, educational implementation of rsync-style block matching.

    A basis file is summarised as a list of per-block (weak, strong)
    checksums; a new file is then scanned byte by byte for windows whose
    checksums match one of those blocks.
    """

    def __init__(self, block_size=1024):
        """Create a matcher that splits files into blocks of block_size bytes.

        Raises:
            ValueError: if block_size is not positive.
        """
        if block_size <= 0:
            raise ValueError("block_size must be a positive integer")
        self.block_size = block_size

    def compute_weak_checksum(self, data: bytes) -> int:
        """Calculate weak checksum (simple rolling checksum).

        Returns a 32-bit value: the low 16 bits hold the byte sum, the high
        16 bits hold the position-weighted byte sum, both modulo 65536.
        """
        length = len(data)
        a = sum(data) % 65536
        b = sum((length - i) * byte for i, byte in enumerate(data)) % 65536
        return (b << 16) | a

    @staticmethod
    def _roll_weak_checksum(checksum: int, outgoing: int, incoming: int,
                            window_len: int) -> int:
        """Update a weak checksum after the window slides forward one byte."""
        a = checksum & 0xFFFF
        b = checksum >> 16
        a = (a - outgoing + incoming) % 65536
        # Every remaining byte gains one unit of weight (adding the new a),
        # while the outgoing byte loses its full weight of window_len.
        b = (b - window_len * outgoing + a) % 65536
        return (b << 16) | a

    def compute_strong_checksum(self, data: bytes) -> str:
        """Calculate strong checksum (MD5 hex digest)."""
        return hashlib.md5(data).hexdigest()

    def compute_file_signature(self, filepath: str) -> List[Tuple[int, str, int]]:
        """Calculate file signature (checksums of all blocks).

        Returns one (weak checksum, strong checksum, block index) tuple per
        block, in file order. The final block may be shorter than block_size.
        """
        signature = []
        with open(filepath, 'rb') as f:
            blocks = iter(lambda: f.read(self.block_size), b'')
            for block_index, block in enumerate(blocks):
                signature.append((
                    self.compute_weak_checksum(block),
                    self.compute_strong_checksum(block),
                    block_index,
                ))
        return signature

    def find_matching_blocks(self, signature: List[Tuple[int, str, int]],
                             filepath: str) -> List[Tuple[int, int]]:
        """Find matching blocks in the file.

        Slides a block-sized window over the file one byte at a time and
        returns (basis block index, byte offset in file) for every window
        whose weak and strong checksums both match a signature entry.
        """
        matches = []

        # weak checksum -> {strong checksum: block index}. Keeping every
        # strong checksum per weak value means blocks that merely collide on
        # the weak checksum do not hide one another.
        index: Dict[int, Dict[str, int]] = {}
        for weak, strong, idx in signature:
            index.setdefault(weak, {})[strong] = idx

        with open(filepath, 'rb') as f:
            window = f.read(self.block_size)
            if not window:
                return matches

            weak_csum = self.compute_weak_checksum(window)
            current_pos = 0
            while True:
                candidates = index.get(weak_csum)
                if candidates is not None:
                    # Only pay for the strong checksum on a weak hit.
                    block_idx = candidates.get(self.compute_strong_checksum(window))
                    if block_idx is not None:
                        matches.append((block_idx, current_pos))

                next_byte = f.read(1)
                if not next_byte:
                    break

                # Roll the checksum forward instead of recomputing it.
                weak_csum = self._roll_weak_checksum(
                    weak_csum, window[0], next_byte[0], len(window))
                window = window[1:] + next_byte
                current_pos += 1

        return matches

    def generate_sync_instructions(self, basis_file: str, new_file: str) -> Dict:
        """Generate synchronization instructions.

        Returns a dict with the keys 'block_size', 'basis_signature',
        'matches' and 'new_file_size' (size of new_file in bytes).
        """
        basis_sig = self.compute_file_signature(basis_file)
        matches = self.find_matching_blocks(basis_sig, new_file)

        instructions = {
            'block_size': self.block_size,
            'basis_signature': basis_sig,
            'matches': matches,
            # Ask the filesystem for the size rather than reading the file.
            'new_file_size': os.path.getsize(new_file)
        }
        return instructions
# Usage example: compare two small text files with an 8-byte block size.
if __name__ == "__main__":
    # Create test files
    with open("basis.txt", "w") as f:
        f.write("This is the content of the basis file, containing some shared text data.")
    with open("new_version.txt", "w") as f:
        f.write("This is the new version file, containing basis content and some new modifications.")

    # Use custom rsync implementation
    rsync = SimpleRsync(block_size=8)
    instructions = rsync.generate_sync_instructions("basis.txt", "new_version.txt")

    print("Synchronization instructions:")
    print(f"Block size: {instructions['block_size']}")
    print(f"Number of matching blocks: {len(instructions['matches'])}")
    print(f"New file size: {instructions['new_file_size']}")

    for match in instructions['matches']:
        print(f"Basis file block {match[0]} matches new file position {match[1]}")
Conclusion
The rsync algorithm provides powerful file synchronization capabilities, significantly reducing network transfer volume and synchronization time through efficient differential transfer. Whether using the C language librsync library, calling the rsync command from Python, or using third-party libraries, developers can integrate the powerful features of rsync into their applications.
Key advantages:
- Efficient Transfer: Only transfer changed file blocks
- Bandwidth Optimization: Significantly reduce network transfer volume
- Reliability: Supports resuming interrupted transfers and integrity verification
- Flexibility: Supports various synchronization modes and filtering rules
In practical applications, rsync is particularly suitable for backup systems, file distribution, and continuous integration, and it is an important tool that every developer should be familiar with.