#!/usr/bin/env ruby
# Usage: shpatch.rb --help
#
# Compares two directory trees and builds a "shellpatch": a list of hunks
# (renames, deletions and unified diffs) describing how to get from the first
# tree to the second.  Files are fingerprinted with MD5; the fingerprints are
# cached in a ".shpatch.state" file so later runs only re-read changed files.
#
# NOTE(review): only the --diff command is implemented in this file.  The
# default :PATCH command and --mv are parsed but have no implementation here.

require 'digest/md5'
require 'ostruct'
require 'optparse'

$config = OpenStruct.new
$config.command = :PATCH
$config.same_base = false
$config.same_ext = true
$config.same_mime = false
$config.changed_content = true
$config.max_removed = 25 # 0 .. 100
$config.max_added = 50
$config.verbose = false

# Default dirglobs to ignore
ignore_globs = [
  "BitKeeper", "PENDING", "SCCS", "CVS", "*.state",
  "*.o", "*.a", "*.so", "*~", "#*#", "*.orig", "*.dll"
]

# Option parsing
$opts = OptionParser.new
$opts.banner = %Q{\
Generate a shellpatch file, or perform the patch in a shellpatch file.

A shellpatch file is a patch file which contains shell-commands
including 'mv' and 'patch'.

Determining the renames uses a lot of heuristics and a brute-force
approach; your milage may vary. All trivial file renames are handled
by comparing the complete contents. All remaining files (the list
of added and removed files) in then searched through to find matching
pairs: this is quite costly

A cache of md5 sums is kept at the root of the repositories to make
finding differences fast.

(c)2005 R. Nijlunsing
License: GPLv2

Usage: shpatch [options]

Defaults options are within [brackets].
}

$opts.separator("Diff options")
$opts.on("-d", "--diff PATH1,PATH2", Array,
         "Generate a shellpatch of the diff",
         "between two directories") { |paths|
  # ArgumentError (not Exception) so the rescue below can stay narrow.
  raise ArgumentError, "Need two directories for --diff" if paths.size != 2
  $config.command = :DIFF
  $config.paths = paths
}

$opts.separator("Diff options for heuristics to finding renames with changed content")
$opts.on("--[no-]changed-content",
         "Find renames with changed content [#{$config.changed_content}]") { |cc|
  $config.changed_content = cc
}
$opts.on("--[no-]same-base",
         "Rename only to files with same basename [#{$config.same_base}]") { |sb|
  $config.same_base = sb
}
$opts.on("--[no-]same-ext",
         "Rename only to same extention [#{$config.same_ext}]") { |se|
  $config.same_ext = se
}
$opts.on("--[no-]same-mime",
         "Rename only to same mimetype [#{$config.same_mime}]") { |sm|
  $config.same_mime = sm
}
$opts.on("--max-removed PERC", String,
         "Max. percentage of source file which may",
         "be removed while still being considered",
         "a rename [#{$config.max_removed}]") { |perc|
  $config.max_removed = perc.to_i
}
$opts.on("--max-added PERC", String,
         "Max. percentage of destination file which may",
         "be added while still being considered",
         "a rename [#{$config.max_added}]") { |perc|
  $config.max_added = perc.to_i
}

$opts.separator("Options to add to current patch")
# NOTE(review): OptionParser hands a single argument to the block, so path2
# is presumably always nil here -- confirm before implementing :MV.
$opts.on("--mv SOURCE DEST", String, String,
         "Adds a rename to the current patch",
         "and perform the rename") { |path1, path2|
  $config.command = :MV
  $config.paths = [path1, path2]
}

$opts.separator("General options")
$opts.on("--[no-]verbose", "-v", "Be more verbose") { |v|
  $config.verbose = v
}
$opts.on("--help", "-h", "This usage") { puts $opts; exit 1 }

%Q{
Examples:
  shpatch.rb --diff linux-2.6.8,linux-2.6.9 --max-removed 10
    Generate a shellpatch with renames from directories linux-2.6.8
    to linux-2.6.9 . At most 10% of a file may be removed between
    versions, otherwise they are considered different.
}.split("\n").each { |line| $opts.separator(line) }

begin
  $opts.parse!(ARGV)
rescue OptionParser::ParseError, ArgumentError => e
  # Deliberately not "rescue Exception": that would also catch the SystemExit
  # raised by --help and print the usage a second time.
  puts "#{$opts}\n!!! #{e}"
  exit 1
end

module Shell
  # Escape string so that the shell parses it back to the string itself.
  # E.g. Shell.escape("what's in a name") = "what\'s\ in\ a\ name"
  # Compare to Regexp.escape
  def self.escape(string)
    string.gsub(%r{([^-._0-9a-zA-Z/])}i, '\\\\\1')
  end
end

# One hunk in the patch: rename a file.
class RenameHunk
  attr_accessor :from, :to # Strings: pathname from and to

  def initialize(from, to)
    @from = from
    @to = to
  end

  def command; "mv"; end

  def to_s; "#{command} #{Shell.escape(@from)} #{Shell.escape(@to)}"; end

  # Performs the rename below repo.root.
  def execute(repo)
    File.rename("#{repo.root}/#{@from}", "#{repo.root}/#{@to}")
  end
end

# One hunk in the patch: delete a file.
class DeleteHunk
  attr_accessor :pathname

  def initialize(pathname); @pathname = pathname; end

  def command; "rm"; end

  def to_s; "#{command} #{Shell.escape(@pathname)}"; end

  # Deletes the file below repo.root.
  def execute(repo); File.delete("#{repo.root}/#{@pathname}"); end
end

# One hunk in the patch: a unified diff between a file in repo1 and a file in
# repo2.  Absolute pathnames (such as "/dev/null") are used as they are;
# relative ones are resolved against the root of their repository.
class PatchHunk
  attr_accessor :from, :to, :contents

  def initialize(repo1, from, repo2, to)
    # The repositories must be remembered: to_s needs their roots.
    @repo1 = repo1
    @repo2 = repo2
    @from = from
    @to = to
  end

  def command; "patch"; end

  # Runs diff(1) and returns a "patch" command with the diff as here-document.
  def to_s
    long_from = Shell.escape(full_path(@repo1, @from))
    long_to = Shell.escape(full_path(@repo2, @to))
    puts "# Diffing #{long_from} -> #{long_to}" if $config.verbose
    @contents = File.popen("diff --unified #{long_from} #{long_to}") { |io| io.read }
    mark = "_SHPATCHMARK_"
    # Make mark unique
    mark += rand(10).to_s while @contents.index(mark)
    # The delimiter is quoted so the shell does not expand '$' or backticks
    # that occur inside the diff.
    "#{command} <<'#{mark}'\n#{@contents}#{mark}"
  end

  private

  # Pathname as handed to diff: absolute names are not prefixed with the root.
  def full_path(repo, name)
    name[0, 1] == "/" ? name : "#{repo.root}/#{name}"
  end
end

# A filesystem as backing store
class FileSystem
  SHPATCHSTATE_FILE = ".shpatch.state"
  SHPATCHSTATE_VERSION_STRING = "shpatch.rb state version 20050418-2"

  attr_accessor :root
  attr_accessor :cache_file # String: filename with signatures
  attr_accessor :signature_cache # From [device, inode] to Array [mtime, sig]
  attr_accessor :signature_cache_changed # Boolean

  # Raises RuntimeError when root does not exist.
  def initialize(root)
    raise "#{root.inspect} does not exist" unless File.exist?(root)
    @root = root
    read_signatures
  end

  # Reads the cache. When not readable in current directory, go
  # up a level ('..') until a usable state file is found or the filesystem
  # root is reached.  Without a usable file the cache starts empty and will
  # be written below @root.
  def read_signatures
    @signature_cache = {}
    @signature_cache_changed = false
    @cache_file = File.expand_path("#{@root}/#{SHPATCHSTATE_FILE}")
    candidate = @cache_file
    loop do
      cache = load_signature_file(candidate)
      if cache
        @signature_cache = cache
        @cache_file = candidate
        puts "# Read signature cache with #{@signature_cache.size} signatures from #{candidate.inspect}" if $config.verbose
        return
      end
      parent = File.expand_path(
        File.dirname(candidate) + "/../" + File.basename(candidate)
      )
      break if parent == candidate
      candidate = parent
    end
  end

  # Save all unsaved signature cache
  def save_signatures
    return unless @signature_cache_changed
    puts "# Saving #{@signature_cache.size} signatures..." if $config.verbose
    pf = @cache_file
    File.open("#{pf}.new", "wb+") do |file|
      file.puts SHPATCHSTATE_VERSION_STRING
      Marshal.dump(@signature_cache, file)
    end
    # Rename only after the file is closed, so a complete file is moved in.
    File.rename("#{pf}.new", pf)
    @signature_cache_changed = false
  end

  # Returns the signature (binary MD5 digest) of the contents of filename.
  # stat is the File::Stat of that file; its device, inode and mtime decide
  # whether the cached signature can be reused.
  def signature(stat, filename)
    key = [stat.dev, stat.ino]
    cache = @signature_cache[key]
    return cache[1] if cache and (cache[0] == stat.mtime)

    if $config.verbose
      why = (cache ? "#{(stat.mtime - cache[0]).to_i}s out of date" : "not indexed")
      puts "# Creating signature for #{filename.inspect} (#{why})"
    end
    signature = Digest::MD5.digest(File.open(filename, "rb") { |f| f.read })
    @signature_cache[key] = [stat.mtime, signature]
    @signature_cache_changed = true
    signature
  end

  # Walks the directory prefix + from recursively and stores the signature of
  # every file not matching ignoreRe in res, keyed by the filename relative
  # to prefix.
  def signature_from(prefix, res, from, ignoreRe)
    Dir.new("#{prefix}#{from}").entries.each { |elem|
      next if (elem == ".") or (elem == "..")
      fullname = "#{prefix}#{from}/#{elem}"
      next if fullname =~ ignoreRe
      stat = File.stat(fullname)
      if stat.directory?
        signature_from(prefix, res, "#{from}/#{elem}", ignoreRe)
      else
        # Strip the leading "/" to get the name relative to the root.
        rel_filename = "#{from}/#{elem}"[1..-1]
        res[rel_filename] = signature(stat, fullname)
      end
    }
  end

  # Returns all filenames within this filesystem with all signatures
  # (Hash from relative filename to signature) and saves the cache.
  def signatures(ignoreRe)
    res = {}
    prefix = File.expand_path(@root)
    signature_from(prefix, res, "", ignoreRe)
    save_signatures
    res
  end

  # Returns the output of "file --mime" for filename, without the leading
  # "name:" part.  Results are memoized in $mime_cache.
  def mime_type(filename)
    path = @root + "/" + filename
    ($mime_cache ||= {})[path] ||=
      File.popen("file --mime #{Shell.escape(path)}") { |io| io.read }.
      gsub(%r{^.*:}, "").strip
  end

  # Read the contents of a file (binary, so any byte sequence is accepted)
  def read(filename)
    File.open(@root + "/" + filename, "rb") { |f| f.read }
  end

  private

  # Returns the signature Hash stored in path, or nil when the file is not
  # readable, has another version string or cannot be unmarshalled.
  def load_signature_file(path)
    return nil unless FileTest.readable?(path)
    File.open(path, "rb") do |file|
      version_string = file.gets # nil for an empty file
      next nil unless version_string and
                      version_string.chomp == SHPATCHSTATE_VERSION_STRING
      begin
        # NOTE(review): Marshal.load trusts the state file; anyone able to
        # write it can make this script instantiate arbitrary objects.
        cache = Marshal.load(file)
        cache.is_a?(Hash) ? cache : nil
      rescue ArgumentError, EOFError, TypeError
        puts "# (error reading state file: rebuilding file...)" if $config.verbose
        nil
      end
    end
  end
end

# statistics of contents of a file. Used for quick-compare
class FileContentStats
  attr_accessor :size # Size of file in lines
  attr_accessor :lines # Hash from String (line) to Integer (occurrences)

  # part as a percentage of total (integer division); 0 when total is 0.
  def self.percentage(part, total)
    total.zero? ? 0 : part * 100 / total
  end

  # Returns the stats for path in repo, computing them only once per pair.
  def self.cached(repo, path)
    @cache ||= {}
    @cache[[repo, path]] ||= new(repo, path)
  end

  # repo must respond to read(path) and return the file contents.
  def initialize(repo, path)
    @lines = Hash.new(0)
    @size = 0
    # Lines are used as keys directly (no Symbols), so arbitrary bytes work.
    repo.read(path).delete("\0").each_line { |line|
      @lines[line] += 1
      @size += 1
    }
  end

  # Count number of lines removed and added as a percentage
  # of the total file length. These are a measure for the degree
  # of matching between the files.
  # Returns [added percentage of other, removed percentage of self].
  def diff_match(other)
    added = 0
    removed = 0
    @lines.each_pair { |line, count|
      delta = other.lines[line] - count
      if delta > 0
        added += delta
      else
        removed += -delta
      end
    }
    # Lines which only exist in other.  key? is needed: @lines defaults to 0,
    # which is truthy.
    other.lines.each_pair { |line, count|
      added += count unless @lines.key?(line)
    }
    [
      FileContentStats.percentage(added, other.size),
      FileContentStats.percentage(removed, size)
    ]
  end
end

class Array
  # The inverse of an array is an hash from contents to index number.
  def inverse; res = {}; each_with_index { |e, idx| res[e] = idx }; res; end
end

# Builds one Regexp matching any path whose last component matches one of
# the globs ("*" matches anything except "/").
def re_from_globs(globs)
  Regexp.new(
    "(\\A|/)(" + globs.collect { |glob|
      Regexp.escape(glob).gsub("\\*", "[^/]*")
    }.join("|") + ")$"
  )
end

# Returns the globs listed in the ignore files found in dir1 and dir2.
def read_ignore_globs(dir1, dir2)
  ["BitKeeper/etc/ignore", ".cvsignore"].collect { |a|
    ["#{dir1}/#{a}", "#{dir2}/#{a}"]
  }.flatten.find_all { |f| File.exist?(f) }.collect { |f|
    File.readlines(f).collect { |line| line.chomp }
  }.flatten.reject { |glob| glob.empty? }
end

# Categorize a file based on filename and/or contents
def pool_type(repo, path)
  res = []
  res << File.basename(path) if $config.same_base
  res << File.extname(path) if $config.same_ext
  res << repo.mime_type(path) if $config.same_mime
  res
end

# Determine how much a filename looks like another filename
# by splitting the filenames into words. Then count the
# words which are the same.
def path_correlation(path1, path2)
  comp1 = path1.split(%r{[-._/]})
  comp2 = path2.split(%r{[-._/]})
  (comp1 - (comp1 - comp2)).size
end

# Searches for renames with changed content.  Returns an Array of
# [removed_file, added_file] pairs in the order they were found; each file
# occurs in at most one pair.
def find_changed_renames(repo1, repo2, all_removed_files, all_added_files)
  matches = []
  pools = {}
  # Group files into 'pools'
  all_removed_files.each { |removed_file|
    (pools[pool_type(repo1, removed_file)] ||= [[], []])[0] << removed_file
  }
  all_added_files.each { |added_file|
    (pools[pool_type(repo2, added_file)] ||= [[], []])[1] << added_file
  }
  pools.each_pair { |key, pool|
    removed_files, added_files = *pool
    # Nothing can match in a one-sided pool; skip it without reading files.
    next if removed_files.empty? or added_files.empty?
    if $config.verbose
      puts "# Comparing pool type #{key.inspect} with #{pool[0].size}x#{pool[1].size} filepairs"
    end
    # Determine how 'special' or 'specific' a word is. We start with
    # filenames containing special words.
    words = {} # Group files by 'words'
    removed_files.each { |removed_file|
      removed_file.split(%r{[-._/]+}).uniq.each { |word|
        (words[word] ||= [[], []])[0] << removed_file
      }
    }
    added_files.each { |added_file|
      added_file.split(%r{[-._/]+}).uniq.each { |word|
        (words[word] ||= [[], []])[1] << added_file
      }
    }
    word_importance = words.keys.find_all { |word|
      (words[word][0].size * words[word][1].size) > 0
    }.sort_by { |word| words[word][0].size * words[word][1].size }.reverse
    word_importance = word_importance.inverse
    word_importance.default = 0
    removed_files.sort_by { |removed_file|
      removed_file.split(%r{[-._/]+}).uniq.inject(0) { |s, e|
        [s, word_importance[e]].max
      }
    }.reverse.each { |removed_file|
      removed_file_stats = FileContentStats.new(repo1, removed_file)
      added_files.sort_by { |f| -path_correlation(removed_file, f) }.each { |added_file|
        added_file_stats = FileContentStats.cached(repo2, added_file)
        removed_size = removed_file_stats.size
        added_size = added_file_stats.size
        # Cheap lower bounds based on the sizes only; percentage() keeps
        # empty files from dividing by zero.
        min_added = FileContentStats.percentage(added_size - removed_size, added_size)
        next if min_added > $config.max_added
        min_removed = FileContentStats.percentage(removed_size - added_size, removed_size)
        next if min_removed > $config.max_removed
        # Calculate added & removed percentages
        added, removed = removed_file_stats.diff_match(added_file_stats)
        if (added <= $config.max_added) && (removed <= $config.max_removed)
          # We found a rename-match!  Only the numbers go through the format
          # string, so a '%' in a filename cannot break it.
          puts(("+%2i%% -%2i%% " % [added, removed]) + "#{removed_file} -> #{added_file}") #if $config.verbose
          matches << [removed_file, added_file]
          # Don't match again against this added file:
          added_files -= [added_file]
          break
        end
      }
    }
  }
  matches
end

# Compares the trees dir1 and dir2 and returns the shellpatch as an Array of
# hunks (PatchHunk and RenameHunk).  ignore_globs lists the globs of names to
# skip; globs from ignore files in both trees are added to it.
def build_shellpatch(dir1, dir2, ignore_globs)
  patch = []
  repo1 = FileSystem.new(dir1)
  repo2 = FileSystem.new(dir2)
  globs = (ignore_globs + read_ignore_globs(dir1, dir2)).uniq.sort
  ignoreRe = re_from_globs(globs)

  puts "# Retrieving signatures of #{dir1.inspect}" if $config.verbose
  file2sig1 = repo1.signatures(ignoreRe)
  puts "# Retrieving signatures of #{dir2.inspect}" if $config.verbose
  file2sig2 = repo2.signatures(ignoreRe)

  # Different hash, same filename: patch
  (file2sig1.keys.sort & file2sig2.keys.sort).each { |fname|
    if file2sig1[fname] != file2sig2[fname]
      patch << PatchHunk.new(repo1, fname, repo2, fname)
    end
    file2sig1.delete(fname)
    file2sig2.delete(fname)
  }

  # Same hash, different filename: rename
  sig2file1 = file2sig1.invert
  sig2file2 = file2sig2.invert
  (sig2file1.keys & sig2file2.keys).each { |sig|
    from = sig2file1[sig]
    to = sig2file2[sig]
    patch << RenameHunk.new(from, to)
    file2sig1.delete(from)
    file2sig2.delete(to)
  }

  # Computed outside the 'if' below: they are needed even when the search
  # for renames with changed content is switched off.
  files1 = file2sig1.keys.sort
  files2 = file2sig2.keys.sort
  all_added_files = files2 - files1
  all_removed_files = files1 - files2

  if $config.changed_content
    find_changed_renames(repo1, repo2, all_removed_files, all_added_files).each { |removed_file, added_file|
      patch << RenameHunk.new(removed_file, added_file)
      patch << PatchHunk.new(repo1, removed_file, repo2, added_file)
      all_added_files -= [added_file]
      all_removed_files -= [removed_file]
    }
  end

  all_added_files.each { |added_file|
    patch << PatchHunk.new(repo1, "/dev/null", repo2, added_file)
  }
  all_removed_files.each { |removed_file|
    patch << PatchHunk.new(repo1, removed_file, repo2, "/dev/null")
  }
  patch
end

case $config.command
when :DIFF
  dir1, dir2 = $config.paths
  patch = build_shellpatch(dir1, dir2, ignore_globs)
  # NOTE(review): emitting the patch was already disabled in the original
  # script; confirm that printing is wanted before enabling this line.
  #patch.each { |hunk| puts hunk.to_s }
else
  # No paths are known for the other commands; report that instead of
  # failing on a nil directory.
  warn "shpatch: command #{$config.command} is not implemented in this script; use --diff PATH1,PATH2 (see --help)"
end