From 2903cddb58f6ee99116e0751a2305f75f9a86461 Mon Sep 17 00:00:00 2001 From: Tianjie Xu Date: Fri, 18 Aug 2017 18:15:47 -0700 Subject: Improve imgdiff for large zip files Due to the cache size limit for OTA generation, we used to split large zip files linearly into pieces and do bsdiff on them. As a result, i) we lose the advantage of imgdiff; ii) if there's an accidental order change of some huge files inside the zip, we'll create an insanely large patch. This patch splits the src&tgt more smartly based on the zip entry_name. If the entry_name is empty or no matching source is found for a target chunk, we'll skip adding its source and later do a bsdiff against the whole split source image (this rarely happens in our use cases except for the metadata inside a ziparchive). After the split, the target pieces are continuous and block aligned, while the sources pieces are mutually exclusive. (Some of the source blocks may not be used if there's no matching entry_name in the target.) Then we will generate patches accordingly between each split image pairs. Afterwards, if we apply imgpatch to each pair of split source/target images and add up the patched result, we can get back the original target image. For example: Input: [src_image, tgt_image] Split: [src-0,tgt-0; src-1,tgt-1, src-2,tgt-2] Diff: [ patch-0; patch-1; patch-2] Patch: [(src-0,patch-0)=tgt-0; (src-1,patch-1)=tgt-1; (src-2,patch-2)=tgt-2;] Append: [tgt-0 + tgt-1 + tgt-2 = tgt_image] Peformance: For the small package in b/34220646, we decrease the patch size of chrome.apk dramatically from 30M to 400K due to the order change of two big .so files. On two versions of angler, I also observe decent patch size decrease. For chrome.apk, we reduced the size from 5.9M to 3.2M; and for vevlet.apk from 8.0M to 6.5M. Bug: 34220646 Test: recovery component test && apply imgdiff & imgpatch on two chrome.apk Change-Id: I145d802984fa805efbbac9d01a2e64d82ef9728b --- applypatch/imgdiff.cpp | 441 +++++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 423 insertions(+), 18 deletions(-) (limited to 'applypatch/imgdiff.cpp') diff --git a/applypatch/imgdiff.cpp b/applypatch/imgdiff.cpp index 59b600713..2eb618fbf 100644 --- a/applypatch/imgdiff.cpp +++ b/applypatch/imgdiff.cpp @@ -125,6 +125,7 @@ #include #include +#include #include #include #include @@ -139,16 +140,19 @@ #include #include #include +#include #include #include #include #include #include "applypatch/imgdiff_image.h" +#include "rangeset.h" using android::base::get_unaligned; -static constexpr auto BUFFER_SIZE = 0x8000; +static constexpr size_t BLOCK_SIZE = 4096; +static constexpr size_t BUFFER_SIZE = 0x8000; // If we use this function to write the offset and length (type size_t), their values should not // exceed 2^63; because the signed bit will be casted away. @@ -162,6 +166,67 @@ static inline bool Write4(int fd, int32_t value) { return android::base::WriteFully(fd, &value, sizeof(int32_t)); } +// Trim the head or tail to align with the block size. Return false if the chunk has nothing left +// after alignment. +static bool AlignHead(size_t* start, size_t* length) { + size_t residual = (*start % BLOCK_SIZE == 0) ? 0 : BLOCK_SIZE - *start % BLOCK_SIZE; + + if (*length <= residual) { + *length = 0; + return false; + } + + // Trim the data in the beginning. + *start += residual; + *length -= residual; + return true; +} + +static bool AlignTail(size_t* start, size_t* length) { + size_t residual = (*start + *length) % BLOCK_SIZE; + if (*length <= residual) { + *length = 0; + return false; + } + + // Trim the data in the end. + *length -= residual; + return true; +} + +// Remove the used blocks from the source chunk to make sure the source ranges are mutually +// exclusive after split. Return false if we fail to get the non-overlapped ranges. In such +// a case, we'll skip the entire source chunk. +static bool RemoveUsedBlocks(size_t* start, size_t* length, const SortedRangeSet& used_ranges) { + if (!used_ranges.Overlaps(*start, *length)) { + return true; + } + + // TODO find the largest non-overlap chunk. + printf("Removing block %s from %zu - %zu\n", used_ranges.ToString().c_str(), *start, + *start + *length - 1); + + // If there's no duplicate entry name, we should only overlap in the head or tail block. Try to + // trim both blocks. Skip this source chunk in case it still overlaps with the used ranges. + if (AlignHead(start, length) && !used_ranges.Overlaps(*start, *length)) { + return true; + } + if (AlignTail(start, length) && !used_ranges.Overlaps(*start, *length)) { + return true; + } + + printf("Failed to remove the overlapped block ranges; skip the source\n"); + return false; +} + +static const struct option OPTIONS[] = { + { "zip-mode", no_argument, nullptr, 'z' }, + { "bonus-file", required_argument, nullptr, 'b' }, + { "block-limit", required_argument, nullptr, 0 }, + { "debug-dir", required_argument, nullptr, 0 }, + { nullptr, 0, nullptr, 0 }, +}; + ImageChunk::ImageChunk(int type, size_t start, const std::vector* file_content, size_t raw_data_len, std::string entry_name) : type_(type), @@ -371,6 +436,12 @@ bool PatchChunk::RawDataIsSmaller(const ImageChunk& tgt, size_t patch_size) { return (tgt.GetType() == CHUNK_NORMAL && (target_len <= 160 || target_len < patch_size)); } +void PatchChunk::UpdateSourceOffset(const SortedRangeSet& src_range) { + if (type_ == CHUNK_DEFLATE) { + source_start_ = src_range.GetOffsetInRangeSet(source_start_); + } +} + // Header size: // header_type 4 bytes // CHUNK_NORMAL 8*3 = 24 bytes @@ -572,7 +643,7 @@ bool ZipModeImage::InitializeChunks(const std::string& filename, ZipArchiveHandl ZipString name; ZipEntry entry; while ((ret = Next(cookie, &entry, &name)) == 0) { - if (entry.method == kCompressDeflated) { + if (entry.method == kCompressDeflated || limit_ > 0) { std::string entry_name(name.name, name.name + name.name_length); temp_entries.emplace_back(entry_name, entry); } @@ -595,6 +666,17 @@ bool ZipModeImage::InitializeChunks(const std::string& filename, ZipArchiveHandl return false; } } + + // Add the end of zip file (mainly central directory) as a normal chunk. + size_t entries_end = 0; + if (!temp_entries.empty()) { + entries_end = static_cast(temp_entries.back().second.offset + + temp_entries.back().second.compressed_length); + } + CHECK_LT(entries_end, file_content_.size()); + chunks_.emplace_back(CHUNK_NORMAL, entries_end, &file_content_, + file_content_.size() - entries_end); + return true; } @@ -635,7 +717,21 @@ bool ZipModeImage::InitializeChunks(const std::string& filename, ZipArchiveHandl bool ZipModeImage::AddZipEntryToChunks(ZipArchiveHandle handle, const std::string& entry_name, ZipEntry* entry) { size_t compressed_len = entry->compressed_length; - if (entry->method == kCompressDeflated) { + if (compressed_len == 0) return true; + + // Split the entry into several normal chunks if it's too large. + if (limit_ > 0 && compressed_len > limit_) { + int count = 0; + while (compressed_len > 0) { + size_t length = std::min(limit_, compressed_len); + std::string name = entry_name + "-" + std::to_string(count); + chunks_.emplace_back(CHUNK_NORMAL, entry->offset + limit_ * count, &file_content_, length, + name); + + count++; + compressed_len -= length; + } + } else if (entry->method == kCompressDeflated) { size_t uncompressed_len = entry->uncompressed_length; std::vector uncompressed_data(uncompressed_len); int ret = ExtractToMemory(handle, entry, uncompressed_data.data(), uncompressed_len); @@ -697,10 +793,23 @@ const ImageChunk* ZipModeImage::FindChunkByName(const std::string& name, bool fi return nullptr; } for (auto& chunk : chunks_) { - if ((chunk.GetType() == CHUNK_DEFLATE || find_normal) && chunk.GetEntryName() == name) { + if (chunk.GetType() != CHUNK_DEFLATE && !find_normal) { + continue; + } + + if (chunk.GetEntryName() == name) { return &chunk; } + + // Edge case when target chunk is split due to size limit but source chunk isn't. + if (name == (chunk.GetEntryName() + "-0") || chunk.GetEntryName() == (name + "-0")) { + return &chunk; + } + + // TODO handle the .so files with incremental version number. + // (e.g. lib/arm64-v8a/libcronet.59.0.3050.4.so) } + return nullptr; } @@ -738,24 +847,214 @@ bool ZipModeImage::CheckAndProcessChunks(ZipModeImage* tgt_image, ZipModeImage* // For zips, we only need merge normal chunks for the target: deflated chunks are matched via // filename, and normal chunks are patched using the entire source file as the source. - tgt_image->MergeAdjacentNormalChunks(); - tgt_image->DumpChunks(); + if (tgt_image->limit_ == 0) { + tgt_image->MergeAdjacentNormalChunks(); + tgt_image->DumpChunks(); + } return true; } -bool ZipModeImage::GeneratePatches(const ZipModeImage& tgt_image, const ZipModeImage& src_image, - const std::string& patch_name) { +// For each target chunk, look for the corresponding source chunk by the zip_entry name. If +// found, add the range of this chunk in the original source file to the block aligned source +// ranges. Construct the split src & tgt image once the size of source range reaches limit. +bool ZipModeImage::SplitZipModeImageWithLimit(const ZipModeImage& tgt_image, + const ZipModeImage& src_image, + std::vector* split_tgt_images, + std::vector* split_src_images, + std::vector* split_src_ranges) { + CHECK_EQ(tgt_image.limit_, src_image.limit_); + size_t limit = tgt_image.limit_; + + src_image.DumpChunks(); + printf("Splitting %zu tgt chunks...\n", tgt_image.NumOfChunks()); + + SortedRangeSet used_src_ranges; // ranges used for previous split source images. + + // Reserve the central directory in advance for the last split image. + const auto& central_directory = src_image.cend() - 1; + CHECK_EQ(CHUNK_NORMAL, central_directory->GetType()); + used_src_ranges.Insert(central_directory->GetStartOffset(), + central_directory->DataLengthForPatch()); + + SortedRangeSet src_ranges; + std::vector split_src_chunks; + std::vector split_tgt_chunks; + for (auto tgt = tgt_image.cbegin(); tgt != tgt_image.cend(); tgt++) { + const ImageChunk* src = src_image.FindChunkByName(tgt->GetEntryName(), true); + if (src == nullptr) { + split_tgt_chunks.emplace_back(CHUNK_NORMAL, tgt->GetStartOffset(), &tgt_image.file_content_, + tgt->GetRawDataLength()); + continue; + } + + size_t src_offset = src->GetStartOffset(); + size_t src_length = src->GetRawDataLength(); + + CHECK(src_length > 0); + CHECK_LE(src_length, limit); + + // Make sure this source range hasn't been used before so that the src_range pieces don't + // overlap with each other. + if (!RemoveUsedBlocks(&src_offset, &src_length, used_src_ranges)) { + split_tgt_chunks.emplace_back(CHUNK_NORMAL, tgt->GetStartOffset(), &tgt_image.file_content_, + tgt->GetRawDataLength()); + } else if (src_ranges.blocks() * BLOCK_SIZE + src_length <= limit) { + src_ranges.Insert(src_offset, src_length); + + // Add the deflate source chunk if it hasn't been aligned. + if (src->GetType() == CHUNK_DEFLATE && src_length == src->GetRawDataLength()) { + split_src_chunks.push_back(*src); + split_tgt_chunks.push_back(*tgt); + } else { + // TODO split smarter to avoid alignment of large deflate chunks + split_tgt_chunks.emplace_back(CHUNK_NORMAL, tgt->GetStartOffset(), &tgt_image.file_content_, + tgt->GetRawDataLength()); + } + } else { + ZipModeImage::AddSplitImageFromChunkList(tgt_image, src_image, src_ranges, split_tgt_chunks, + split_src_chunks, split_tgt_images, + split_src_images); + + split_tgt_chunks.clear(); + split_src_chunks.clear(); + used_src_ranges.Insert(src_ranges); + split_src_ranges->push_back(std::move(src_ranges)); + src_ranges.Clear(); + + // We don't have enough space for the current chunk; start a new split image and handle + // this chunk there. + tgt--; + } + } + + // TODO Trim it in case the CD exceeds limit too much. + src_ranges.Insert(central_directory->GetStartOffset(), central_directory->DataLengthForPatch()); + ZipModeImage::AddSplitImageFromChunkList(tgt_image, src_image, src_ranges, split_tgt_chunks, + split_src_chunks, split_tgt_images, split_src_images); + split_src_ranges->push_back(std::move(src_ranges)); + + ValidateSplitImages(*split_tgt_images, *split_src_images, *split_src_ranges, + tgt_image.file_content_.size()); + + return true; +} + +void ZipModeImage::AddSplitImageFromChunkList(const ZipModeImage& tgt_image, + const ZipModeImage& src_image, + const SortedRangeSet& split_src_ranges, + const std::vector& split_tgt_chunks, + const std::vector& split_src_chunks, + std::vector* split_tgt_images, + std::vector* split_src_images) { + CHECK(!split_tgt_chunks.empty()); + // Target chunks should occupy at least one block. + // TODO put a warning and change the type to raw if it happens in extremely rare cases. + size_t tgt_size = split_tgt_chunks.back().GetStartOffset() + + split_tgt_chunks.back().DataLengthForPatch() - + split_tgt_chunks.front().GetStartOffset(); + CHECK_GE(tgt_size, BLOCK_SIZE); + + std::vector aligned_tgt_chunks; + + // Align the target chunks in the beginning with BLOCK_SIZE. + size_t i = 0; + while (i < split_tgt_chunks.size()) { + size_t tgt_start = split_tgt_chunks[i].GetStartOffset(); + size_t tgt_length = split_tgt_chunks[i].GetRawDataLength(); + + // Current ImageChunk is long enough to align. + if (AlignHead(&tgt_start, &tgt_length)) { + aligned_tgt_chunks.emplace_back(CHUNK_NORMAL, tgt_start, &tgt_image.file_content_, + tgt_length); + break; + } + + i++; + } + CHECK_LT(i, split_tgt_chunks.size()); + aligned_tgt_chunks.insert(aligned_tgt_chunks.end(), split_tgt_chunks.begin() + i + 1, + split_tgt_chunks.end()); + CHECK(!aligned_tgt_chunks.empty()); + + // Add a normal chunk to align the contents in the end. + size_t end_offset = + aligned_tgt_chunks.back().GetStartOffset() + aligned_tgt_chunks.back().GetRawDataLength(); + if (end_offset % BLOCK_SIZE != 0 && end_offset < tgt_image.file_content_.size()) { + aligned_tgt_chunks.emplace_back(CHUNK_NORMAL, end_offset, &tgt_image.file_content_, + BLOCK_SIZE - (end_offset % BLOCK_SIZE)); + } + + ZipModeImage split_tgt_image(false); + split_tgt_image.Initialize(std::move(aligned_tgt_chunks), {}); + split_tgt_image.MergeAdjacentNormalChunks(); + + // Construct the dummy source file based on the src_ranges. + std::vector src_content; + for (const auto& r : split_src_ranges) { + size_t end = std::min(src_image.file_content_.size(), r.second * BLOCK_SIZE); + src_content.insert(src_content.end(), src_image.file_content_.begin() + r.first * BLOCK_SIZE, + src_image.file_content_.begin() + end); + } + + // We should not have an empty src in our design; otherwise we will encounter an error in + // bsdiff since src_content.data() == nullptr. + CHECK(!src_content.empty()); + + ZipModeImage split_src_image(true); + split_src_image.Initialize(split_src_chunks, std::move(src_content)); + + split_tgt_images->push_back(std::move(split_tgt_image)); + split_src_images->push_back(std::move(split_src_image)); +} + +void ZipModeImage::ValidateSplitImages(const std::vector& split_tgt_images, + const std::vector& split_src_images, + std::vector& split_src_ranges, + size_t total_tgt_size) { + CHECK_EQ(split_tgt_images.size(), split_src_images.size()); + + printf("Validating %zu images\n", split_tgt_images.size()); + + // Verify that the target image pieces is continuous and can add up to the total size. + size_t last_offset = 0; + for (const auto& tgt_image : split_tgt_images) { + CHECK(!tgt_image.chunks_.empty()); + + CHECK_EQ(last_offset, tgt_image.chunks_.front().GetStartOffset()); + CHECK(last_offset % BLOCK_SIZE == 0); + + // Check the target chunks within the split image are continuous. + for (const auto& chunk : tgt_image.chunks_) { + CHECK_EQ(last_offset, chunk.GetStartOffset()); + last_offset += chunk.GetRawDataLength(); + } + } + CHECK_EQ(total_tgt_size, last_offset); + + // Verify that the source ranges are mutually exclusive. + CHECK_EQ(split_src_images.size(), split_src_ranges.size()); + SortedRangeSet used_src_ranges; + for (size_t i = 0; i < split_src_ranges.size(); i++) { + CHECK(!used_src_ranges.Overlaps(split_src_ranges[i])) + << "src range " << split_src_ranges[i].ToString() << " overlaps " + << used_src_ranges.ToString(); + used_src_ranges.Insert(split_src_ranges[i]); + } +} + +bool ZipModeImage::GeneratePatchesInternal(const ZipModeImage& tgt_image, + const ZipModeImage& src_image, + std::vector* patch_chunks) { printf("Construct patches for %zu chunks...\n", tgt_image.NumOfChunks()); - std::vector patch_chunks; - patch_chunks.reserve(tgt_image.NumOfChunks()); + patch_chunks->clear(); saidx_t* bsdiff_cache = nullptr; for (size_t i = 0; i < tgt_image.NumOfChunks(); i++) { const auto& tgt_chunk = tgt_image[i]; if (PatchChunk::RawDataIsSmaller(tgt_chunk, 0)) { - patch_chunks.emplace_back(tgt_chunk); + patch_chunks->emplace_back(tgt_chunk); continue; } @@ -776,13 +1075,23 @@ bool ZipModeImage::GeneratePatches(const ZipModeImage& tgt_image, const ZipModeI tgt_chunk.GetRawDataLength()); if (PatchChunk::RawDataIsSmaller(tgt_chunk, patch_data.size())) { - patch_chunks.emplace_back(tgt_chunk); + patch_chunks->emplace_back(tgt_chunk); } else { - patch_chunks.emplace_back(tgt_chunk, src_ref, std::move(patch_data)); + patch_chunks->emplace_back(tgt_chunk, src_ref, std::move(patch_data)); } } free(bsdiff_cache); + CHECK_EQ(patch_chunks->size(), tgt_image.NumOfChunks()); + return true; +} + +bool ZipModeImage::GeneratePatches(const ZipModeImage& tgt_image, const ZipModeImage& src_image, + const std::string& patch_name) { + std::vector patch_chunks; + + ZipModeImage::GeneratePatchesInternal(tgt_image, src_image, &patch_chunks); + CHECK_EQ(tgt_image.NumOfChunks(), patch_chunks.size()); android::base::unique_fd patch_fd( @@ -795,6 +1104,66 @@ bool ZipModeImage::GeneratePatches(const ZipModeImage& tgt_image, const ZipModeI return PatchChunk::WritePatchDataToFd(patch_chunks, patch_fd); } +bool ZipModeImage::GeneratePatches(const std::vector& split_tgt_images, + const std::vector& split_src_images, + const std::vector& split_src_ranges, + const std::string& patch_name, const std::string& debug_dir) { + printf("Construct patches for %zu split images...\n", split_tgt_images.size()); + + android::base::unique_fd patch_fd( + open(patch_name.c_str(), O_CREAT | O_WRONLY | O_TRUNC, S_IRUSR | S_IWUSR)); + if (patch_fd == -1) { + printf("failed to open \"%s\": %s\n", patch_name.c_str(), strerror(errno)); + return false; + } + + for (size_t i = 0; i < split_tgt_images.size(); i++) { + std::vector patch_chunks; + if (!ZipModeImage::GeneratePatchesInternal(split_tgt_images[i], split_src_images[i], + &patch_chunks)) { + printf("failed to generate split patch\n"); + return false; + } + + for (auto& p : patch_chunks) { + p.UpdateSourceOffset(split_src_ranges[i]); + } + + if (!PatchChunk::WritePatchDataToFd(patch_chunks, patch_fd)) { + return false; + } + + // Write the split source & patch into the debug directory. + if (!debug_dir.empty()) { + std::string src_name = android::base::StringPrintf("%s/src-%zu", debug_dir.c_str(), i); + android::base::unique_fd fd( + open(src_name.c_str(), O_CREAT | O_WRONLY | O_TRUNC, S_IRUSR | S_IWUSR)); + + if (fd == -1) { + printf("Failed to open %s\n", src_name.c_str()); + return false; + } + if (!android::base::WriteFully(fd, split_src_images[i].PseudoSource().DataForPatch(), + split_src_images[i].PseudoSource().DataLengthForPatch())) { + printf("Failed to write split source data into %s\n", src_name.c_str()); + return false; + } + + std::string patch_name = android::base::StringPrintf("%s/patch-%zu", debug_dir.c_str(), i); + fd.reset(open(patch_name.c_str(), O_CREAT | O_WRONLY | O_TRUNC, S_IRUSR | S_IWUSR)); + + if (fd == -1) { + printf("Failed to open %s\n", patch_name.c_str()); + return false; + } + if (!PatchChunk::WritePatchDataToFd(patch_chunks, fd)) { + return false; + } + } + } + return true; +} + bool ImageModeImage::Initialize(const std::string& filename) { if (!ReadFile(filename, &file_content_)) { return false; @@ -1026,11 +1395,14 @@ bool ImageModeImage::GeneratePatches(const ImageModeImage& tgt_image, int imgdiff(int argc, const char** argv) { bool zip_mode = false; std::vector bonus_data; + size_t blocks_limit = 0; + std::string debug_dir; int opt; + int option_index; optind = 1; // Reset the getopt state so that we can call it multiple times for test. - while ((opt = getopt(argc, const_cast(argv), "zb:")) != -1) { + while ((opt = getopt_long(argc, const_cast(argv), "zb:", OPTIONS, &option_index)) != -1) { switch (opt) { case 'z': zip_mode = true; @@ -1055,6 +1427,16 @@ int imgdiff(int argc, const char** argv) { } break; } + case 0: { + std::string name = OPTIONS[option_index].name; + if (name == "block-limit" && !android::base::ParseUint(optarg, &blocks_limit)) { + printf("failed to parse size blocks_limit: %s\n", optarg); + return 1; + } else if (name == "debug-dir") { + debug_dir = optarg; + } + break; + } default: printf("unexpected opt: %s\n", optarg); return 2; @@ -1062,13 +1444,20 @@ int imgdiff(int argc, const char** argv) { } if (argc - optind != 3) { - printf("usage: %s [-z] [-b ] \n", argv[0]); + printf("usage: %s [options] \n", argv[0]); + printf( + " -z , Generate patches in zip mode, src and tgt should be zip files.\n" + " -b , Bonus file in addition to src, image mode only.\n" + " --block-limit, For large zips, split the src and tgt based on the block limit;\n" + " and generate patches between each pair of pieces. Concatenate these\n" + " patches together and output them into .\n" + " --debug_dir, Debug directory to put the split srcs and patches, zip mode only.\n"); return 2; } if (zip_mode) { - ZipModeImage src_image(true); - ZipModeImage tgt_image(false); + ZipModeImage src_image(true, blocks_limit * BLOCK_SIZE); + ZipModeImage tgt_image(false, blocks_limit * BLOCK_SIZE); if (!src_image.Initialize(argv[optind])) { return 1; @@ -1080,9 +1469,25 @@ int imgdiff(int argc, const char** argv) { if (!ZipModeImage::CheckAndProcessChunks(&tgt_image, &src_image)) { return 1; } + + // TODO save and output the split information so that caller can create split transfer lists + // accordingly. + // Compute bsdiff patches for each chunk's data (the uncompressed data, in the case of // deflate chunks). - if (!ZipModeImage::GeneratePatches(tgt_image, src_image, argv[optind + 2])) { + if (blocks_limit > 0) { + std::vector split_tgt_images; + std::vector split_src_images; + std::vector split_src_ranges; + ZipModeImage::SplitZipModeImageWithLimit(tgt_image, src_image, &split_tgt_images, + &split_src_images, &split_src_ranges); + + if (!ZipModeImage::GeneratePatches(split_tgt_images, split_src_images, split_src_ranges, + argv[optind + 2], debug_dir)) { + return 1; + } + + } else if (!ZipModeImage::GeneratePatches(tgt_image, src_image, argv[optind + 2])) { return 1; } } else { -- cgit v1.2.3