Re: [PATCH] pack-objects: re-validate data we copy from elsewhere.

From: Junio C Hamano <junkio@cox.net>
Date: 2006-09-04 14:06:59
Junio C Hamano <junkio@cox.net> writes:

> Linus Torvalds <torvalds@osdl.org> writes:
>
>> Ok. Is it less painful if it just checks the zlib CRC...
>
> I haven't checked myself but somebody said that zlib CRC is of
> preimage so we would need to incur inflate cost anyway if that
> is the case.  But I think it may be a reasonable comproise to
> assume that an existing delta that inflates properly would apply
> to its base object, and if we can assume that we do not have to
> check the inflated xdelta data.
> ...
> Another thing the current check is _not_ doing is for this
> pathological case:
>
>  - .idx table says the pack entry is N bytes
>
>  - unpack_entry_gently() used in the revalidate code uses the
>    usual codepath that says "here is the start of the pack
>    entry; inflate using as much data as you need"; .idx is
>    somehow wrong and it needed N+M bytes where 0 < M.
>
>  - we copy out N bytes because we belive .idx.

So here is a patch against "master" which contains none of the
earlier patches in this thread.

When copying from an existing pack and when copying from a loose
object with new style header, the code makes sure that the piece
we are going to copy out inflates well and inflate() consumes
the data in full while doing so.

The check to see if the xdelta really apply is quite expensive
as you described, because you would need to have the image of
the base object which can be represented as a delta against
something else.

The same repack takes this much with this code.

Total 301361, written 301361 (delta 238935), reused 300995 (delta 238663)
6.66user 0.67system 0:07.40elapsed 99%CPU (0avgtext+0avgdata 0maxresident)k
0inputs+0outputs (0major+62728minor)pagefaults 0swaps

---
diff --git a/builtin-pack-objects.c b/builtin-pack-objects.c
index 46f524d..149fa28 100644
--- a/builtin-pack-objects.c
+++ b/builtin-pack-objects.c
@@ -65,6 +65,7 @@ static unsigned char pack_file_sha1[20];
 static int progress = 1;
 static volatile sig_atomic_t progress_update;
 static int window = 10;
+static int pack_to_stdout;
 
 /*
  * The object names in objects array are hashed with this hashtable,
@@ -242,6 +243,82 @@ static int encode_header(enum object_typ
 	return n;
 }
 
+static int check_inflate(unsigned char *data, unsigned long len, unsigned long expect)
+{
+	z_stream stream;
+	unsigned char fakebuf[4096];
+	int st;
+
+	memset(&stream, 0, sizeof(stream));
+	stream.next_in = data;
+	stream.avail_in = len;
+	stream.next_out = fakebuf;
+	stream.avail_out = sizeof(fakebuf);
+	inflateInit(&stream);
+
+	while (1) {
+		st = inflate(&stream, Z_FINISH);
+		if (st == Z_STREAM_END || st == Z_OK) {
+			st = (stream.total_out == expect &&
+			      stream.total_in == len) ? 0 : -1;
+			break;
+		}
+		if (st != Z_BUF_ERROR) {
+			st = -1;
+			break;
+		}
+		stream.next_out = fakebuf;
+		stream.avail_out = sizeof(fakebuf);
+	}
+	inflateEnd(&stream);
+	return st;
+}
+
+/*
+ * we are going to reuse the existing pack entry data.  make
+ * sure it is not corrupt.
+ */
+static int revalidate_pack_entry(struct object_entry *entry, unsigned char *data, unsigned long len)
+{
+	enum object_type type;
+	unsigned long size, used;
+
+	if (pack_to_stdout)
+		return 0;
+
+	/* the caller has already called use_packed_git() for us,
+	 * so it is safe to access the pack data from mmapped location.
+	 * make sure the entry inflates correctly.
+	 */
+	used = unpack_object_header_gently(data, len, &type, &size);
+	if (!used)
+		return -1;
+	if (type == OBJ_DELTA)
+		used += 20; /* skip base object name */
+	data += used;
+	len -= used;
+	return check_inflate(data, len, entry->size);
+}
+
+static int revalidate_loose_object(struct object_entry *entry,
+				   unsigned char *map,
+				   unsigned long mapsize)
+{
+	/* we already know this is a loose object with new type header. */
+	enum object_type type;
+	unsigned long size, used;
+
+	if (pack_to_stdout)
+		return 0;
+
+	used = unpack_object_header_gently(map, mapsize, &type, &size);
+	if (!used)
+		return -1;
+	map += used;
+	mapsize -= used;
+	return check_inflate(map, mapsize, size);
+}
+
 static unsigned long write_object(struct sha1file *f,
 				  struct object_entry *entry)
 {
@@ -276,6 +353,9 @@ static unsigned long write_object(struct
 		map = map_sha1_file(entry->sha1, &mapsize);
 		if (map && !legacy_loose_object(map)) {
 			/* We can copy straight into the pack file */
+			if (revalidate_loose_object(entry, map, mapsize))
+				die("corrupt loose object %s",
+				    sha1_to_hex(entry->sha1));
 			sha1write(f, map, mapsize);
 			munmap(map, mapsize);
 			written++;
@@ -286,7 +366,7 @@ static unsigned long write_object(struct
 			munmap(map, mapsize);
 	}
 
-	if (! to_reuse) {
+	if (!to_reuse) {
 		buf = read_sha1_file(entry->sha1, type, &size);
 		if (!buf)
 			die("unable to read %s", sha1_to_hex(entry->sha1));
@@ -319,6 +399,9 @@ static unsigned long write_object(struct
 
 		datalen = find_packed_object_size(p, entry->in_pack_offset);
 		buf = (char *) p->pack_base + entry->in_pack_offset;
+
+		if (revalidate_pack_entry(entry, buf, datalen))
+			die("corrupt delta in pack %s", sha1_to_hex(entry->sha1));
 		sha1write(f, buf, datalen);
 		unuse_packed_git(p);
 		hdrlen = 0; /* not really */
@@ -1163,7 +1246,7 @@ static void prepare_pack(int window, int
 		find_deltas(sorted_by_type, window+1, depth);
 }
 
-static int reuse_cached_pack(unsigned char *sha1, int pack_to_stdout)
+static int reuse_cached_pack(unsigned char *sha1)
 {
 	static const char cache[] = "pack-cache/pack-%s.%s";
 	char *cached_pack, *cached_idx;
@@ -1247,7 +1330,7 @@ int cmd_pack_objects(int argc, const cha
 {
 	SHA_CTX ctx;
 	char line[40 + 1 + PATH_MAX + 2];
-	int depth = 10, pack_to_stdout = 0;
+	int depth = 10;
 	struct object_entry **list;
 	int num_preferred_base = 0;
 	int i;
@@ -1367,7 +1450,7 @@ int cmd_pack_objects(int argc, const cha
 	if (progress && (nr_objects != nr_result))
 		fprintf(stderr, "Result has %d objects.\n", nr_result);
 
-	if (reuse_cached_pack(object_list_sha1, pack_to_stdout))
+	if (reuse_cached_pack(object_list_sha1))
 		;
 	else {
 		if (nr_result)


-- 
VGER BF report: U 0.976258
-
To unsubscribe from this list: send the line "unsubscribe git" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Received on Mon Sep 04 14:07:35 2006

This archive was generated by hypermail 2.1.8 : 2006-09-04 14:08:11 EST