libzypp 17.35.11
mediablocklist.cc
Go to the documentation of this file.
1/*---------------------------------------------------------------------\
2| ____ _ __ __ ___ |
3| |__ / \ / / . \ . \ |
4| / / \ V /| _/ _/ |
5| / /__ | | | | | | |
6| /_____||_| |_| |_| |
7| |
8\---------------------------------------------------------------------*/
101#include "mediablocklist.h"
102
103#include <sys/types.h>
104#include <stdio.h>
105#include <stdlib.h>
106#include <string.h>
107
108#include <utility>
109#include <vector>
110#include <iostream>
111#include <fstream>
112
117
118using namespace zypp::base;
119
120namespace zypp {
121 namespace media {
122
123 namespace {
124
125 struct rsum {
126 unsigned short a = 0;
127 unsigned short b = 0;
128 } __attribute__((packed));
129
130 /* rcksum_calc_rsum_block(data, data_len)
131 * Calculate the rsum for a single block of data. */
132 rsum rcksum_calc_rsum_block(const unsigned char *data, size_t len) {
133 unsigned short a = 0;
134 unsigned short b = 0;
135
136 while (len) {
137 unsigned char c = *data++;
138 a += c;
139 b += len * c;
140 len--;
141 }
142 return rsum{ a, b };
143 }
144
145 // update the rsum by removing the old and adding the new char
146 #define UPDATE_RSUM(a, b, oldc, newc, bshift) do { (a) += ((unsigned char)(newc)) - ((unsigned char)(oldc)); (b) += (a) - ((oldc) << (bshift)); } while (0)
147
152 void inline truncateRsum ( unsigned int &rs, const int rsumlen )
153 {
154 switch(rsumlen)
155 {
156 case 3:
157 rs &= 0xffffff;
158 break;
159 case 2:
160 rs &= 0xffff;
161 break;
162 case 1:
163 rs &= 0xff;
164 break;
165 default:
166 break;
167 }
168 }
169
170 }
171
173: filesize(size),
174 haveblocks(false),
175 chksumlen(0),
176 chksumpad(0),
177 rsumlen(0),
178 rsumseq(0),
179 rsumpad(0)
180{ }
181
182size_t
183MediaBlockList::addBlock(off_t off, size_t size)
184{
185 haveblocks = true;
186 blocks.push_back(MediaBlock( off, size ));
187 return blocks.size() - 1;
188}
189
190void
191MediaBlockList::setFileChecksum(std::string ctype, int cl, unsigned char *c)
192{
193 if (!cl)
194 return;
195 fsumtype = std::move(ctype);
196 fsum.resize(cl);
197 memcpy(&fsum[0], c, cl);
198}
199
201{
202 return fsumtype;
203}
204
206{
207 return fsum;
208}
209
210bool
212{
213 return digest.create(fsumtype);
214}
215
216bool
218{
219 if (!haveFileChecksum())
220 return true;
221 std::vector<unsigned char>dig = digest.digestVector();
222 if (dig.empty() || dig.size() < fsum.size())
223 return false;
224 return memcmp(&dig[0], &fsum[0], fsum.size()) ? false : true;
225}
226
227void
228MediaBlockList::setChecksum(size_t blkno, const std::string& cstype, int csl, unsigned char *cs, size_t cspad)
229{
230 if (!csl)
231 return;
232 if (!chksumlen)
233 {
234 if (blkno)
235 return;
236 chksumlen = csl;
237 chksumtype = cstype;
238 chksumpad = cspad;
239 }
240 if (csl != chksumlen || cstype != chksumtype || cspad != chksumpad || blkno != chksums.size() / chksumlen)
241 return;
242 chksums.resize(chksums.size() + csl);
243 memcpy(&chksums[csl * blkno], cs, csl);
244}
245
246void
247MediaBlockList::setRsum(size_t blkno, int rsl, unsigned int rs, size_t rspad)
248{
249 if (!rsl)
250 return;
251 if (!rsumlen)
252 {
253 if (blkno)
254 return;
255 rsumlen = rsl;
256 rsumpad = rspad;
257 }
258 if (rsl != rsumlen || rspad != rsumpad || blkno != rsums.size())
259 return;
260 rsums.push_back(rs);
261}
262
263void
265{
266 rsumseq = seq;
267}
268
269bool
271{
272 return digest.create(chksumtype);
273}
274
275bool
276MediaBlockList::verifyDigest(size_t blkno, Digest &digest) const
277{
278 if (!haveChecksum(blkno))
279 return true;
280 size_t size = blocks[blkno].size;
281 if (!size)
282 return true;
283 if (chksumpad > size)
284 {
285 char pad[chksumpad - size];
286 memset(pad, 0, chksumpad - size);
287 digest.update(pad, chksumpad - size);
288 }
289 std::vector<unsigned char>dig = digest.digestVector();
290 if (dig.empty() || dig.size() < size_t(chksumlen))
291 return false;
292 return memcmp(&dig[0], &chksums[chksumlen * blkno], chksumlen) ? false : true;
293}
294
295unsigned int
296MediaBlockList::updateRsum(unsigned int rs, const char* bytes, size_t len) const
297{
298 if (!len)
299 return rs;
300
301 unsigned short s = 0, m = 0;
302 s = (rs >> 16) & 65535;
303 m = rs & 65535;
304 for (; len > 0 ; len--)
305 {
306 unsigned short c = (unsigned char)*bytes++;
307 s += c;
308 m += s;
309 }
310 return (s & 65535) << 16 | (m & 65535);
311}
312
313bool
314MediaBlockList::verifyRsum(size_t blkno, unsigned int rs) const
315{
316 if (!haveRsum(blkno))
317 return true;
318 size_t size = blocks[blkno].size;
319 if (!size)
320 return true;
321 if (rsumpad > size)
322 {
323 unsigned short s = 0, m = 0;
324 s = (rs >> 16) & 65535;
325 m = rs & 65535;
326 m += s * (rsumpad - size);
327 rs = (s & 65535) << 16 | (m & 65535);
328 }
329 truncateRsum( rs, rsumlen );
330 return rs == rsums[blkno];
331}
332
333bool
334MediaBlockList::checkRsum(size_t blkno, const unsigned char *buf, size_t bufl) const
335{
336 if (blkno >= blocks.size() || bufl < blocks[blkno].size)
337 return false;
338 unsigned int rs = updateRsum(0, (const char *)buf, blocks[blkno].size);
339 return verifyRsum(blkno, rs);
340}
341
342bool
343MediaBlockList::checkChecksum(size_t blkno, const unsigned char *buf, size_t bufl) const
344{
345 if (blkno >= blocks.size() || bufl < blocks[blkno].size)
346 return false;
347 Digest dig;
348 if (!createDigest(dig))
349 return false;
350 dig.update((const char *)buf, blocks[blkno].size);
351 return verifyDigest(blkno, dig);
352}
353
355{
356 if ( !haveChecksum(blkno) )
357 return {};
358
359 UByteArray buf ( chksumlen, '\0' );
360 memcpy( buf.data(), chksums.data()+(chksumlen * blkno), chksumlen );
361 return buf;
362}
363
365{
366 return chksumtype;
367}
368
370{
371 return chksumpad;
372}
373
374// specialized version of checkChecksum that can deal with a "rotated" buffer
375bool
376MediaBlockList::checkChecksumRotated(size_t blkno, const unsigned char *buf, size_t bufl, size_t start) const
377{
378 if (blkno >= blocks.size() || bufl < blocks[blkno].size)
379 return false;
380 if (start == bufl)
381 start = 0;
382 Digest dig;
383 if (!createDigest(dig))
384 return false;
385 size_t size = blocks[blkno].size;
386 size_t len = bufl - start > size ? size : bufl - start;
387 dig.update((const char *)buf + start, len);
388 if (size > len)
389 dig.update((const char *)buf, size - len);
390 return verifyDigest(blkno, dig);
391}
392
393// write block to the file. can also deal with "rotated" buffers
394void
395MediaBlockList::writeBlock(size_t blkno, FILE *fp, const unsigned char *buf, size_t bufl, size_t start, std::vector<bool> &found) const
396{
397 if (blkno >= blocks.size() || bufl < blocks[blkno].size)
398 return;
399 off_t off = blocks[blkno].off;
400 size_t size = blocks[blkno].size;
401 if (fseeko(fp, off, SEEK_SET))
402 return;
403 if (start == bufl)
404 start = 0;
405 size_t len = bufl - start > size ? size : bufl - start;
406 if (fwrite(buf + start, len, 1, fp) != 1)
407 return;
408 if (size > len && fwrite(buf, size - len, 1, fp) != 1)
409 return;
410 found[blkno] = true;
411 found[blocks.size()] = true;
412}
413
414static size_t
415fetchnext(FILE *fp, unsigned char *bp, size_t blksize, size_t pushback, unsigned char *pushbackp)
416{
417 size_t l = blksize;
418 int c = 0;
419
420 if (pushback)
421 {
422 if (pushbackp != bp)
423 memmove(bp, pushbackp, pushback);
424 bp += pushback;
425 l -= pushback;
426 }
427 while (l)
428 {
429 c = getc(fp);
430 if (c == EOF)
431 break;
432 *bp++ = c;
433 l--;
434 }
435 if (l)
436 memset(bp, 0, l);
437 return blksize - l;
438}
439
440void MediaBlockList::reuseBlocks(FILE *wfp, const std::string& filename)
441{
442
444
445 if ( !chksumlen ) {
446 DBG << "Delta XFER: Can not reuse blocks because we have no chksumlen" << std::endl;
447 return;
448 }
449
450 if ( (fp = fopen(filename.c_str(), "r")) == 0 ) {
451 DBG << "Delta XFER: Can not reuse blocks, unable to open file "<< filename << std::endl;
452 return;
453 }
454
455 size_t nblks = blocks.size();
456 std::vector<bool> found( nblks + 1 );
457 if (rsumlen && !rsums.empty()) {
458
459 const auto rsumAMask = rsumlen < 3 ? 0 : rsumlen == 3 ? 0xff : 0xffff;
460
461 // we are building a array of rsum structs to directly access a and b parts of the checksum
462 // we make the array bigger so that the code calculating the rsum hashes does not need to care
463 // about the array size
464 // @TODO move that to the function adding new rsums when removing the old code
465 auto zsyncRsumsData = std::make_unique<rsum[]>( rsums.size() + rsumseq );
466 auto zsyncRsums = zsyncRsumsData.get();
467 for ( std::size_t i = 0; i < rsums.size(); i++ ) {
468 const auto &rs = rsums[i];
469 unsigned short s = 0, m = 0;
470 s = (rs >> 16) & 65535;
471 m = rs & 65535;
472 zsyncRsums[i] = rsum{ s, m };
473 }
474
475 // we use the same code as zsync to calc the hash
476 const auto & calc_rhash = [&]( const rsum* e ) -> unsigned {
477 unsigned h = e[0].b;
478 if ( this->rsumseq > 1 ) {
479 for ( uint i = 1; i < rsumseq; i++ ) {
480 h ^= e[i].b << 3;
481 }
482 } else {
483 h ^= ( e[0].a & rsumAMask ) << 3;
484 }
485 return h;
486 };
487
488 size_t blksize = blocks[0].size;
489 if (nblks == 1 && rsumpad && rsumpad > blksize)
490 blksize = rsumpad;
491
492 // create hash of checksums
493 // build the hashtable
494 uint rsumHashMask = 0;
495 {
496 int i = 16;
497
498 /* Try hash size of 2^i; step down the value of i until we find a good size */
499 while ((2 << (i - 1)) > nblks && i > 4)
500 i--;
501
502 /* Allocate hash based on rsum */
503 rsumHashMask = (2 << i) - 1;
504 }
505
506 // a array indexed via hash with lists of blocks resulting in the same rsum hash
507 auto rsumHashTableData = std::make_unique<std::vector<size_t>[]>( rsumHashMask + 1 );
508 auto rsumHashTable = rsumHashTableData.get();
509
510 // Now fill in the hash tables.
511 for ( size_t id = 0; id < nblks; id++) {
512 const auto hash = calc_rhash( &zsyncRsums[id] );
513 auto &hashList = rsumHashTable[ hash & rsumHashMask ];
514 hashList.push_back(id);
515 }
516
517 // we read in 16 sequences at once to speed up processing
518 constexpr auto BLOCKCNT = 16;
519
520 // we allocate the buffer so that we always have the data to verify 16 blocks, if we need to do
521 // sequence matching we grow the buffer accordingly
522 const auto readBufSize = blksize * rsumseq * BLOCKCNT;
523
524 // buffer thats going to hold our cached data
525 auto readBufData = std::make_unique<unsigned char[]>( readBufSize );
526 memset(readBufData.get(), 0, blksize);
527
528 // avoid using .get() all the time
529 auto readBuf = readBufData.get();
530
531 // our running checksums for the blocks we need to match in sequence
532 auto seqRsumsData = std::make_unique<rsum[]> ( rsumseq );
533 auto seqRsums = seqRsumsData.get();
534
535 // use byteshift instead of multiplication if the blksize is a power of 2
536 // a value is a power of 2 if ( N & N-1 ) == 0
537 int bshift = 0; // how many bytes do we need to shift
538 if ((blksize & (blksize - 1)) == 0)
539 for (bshift = 0; size_t(1 << bshift) != blksize; bshift++)
540 ;
541
542 bool init = true;
543 // when we are in a run of matches, we remember which block ID would need to match next in order
544 // to continue writing
545 std::optional<size_t> nextReqMatchInSequence;
546 off_t dataOffset = 0; //< Our current read offset in the buffer
547 off_t dataLen = 0; //< The length of our read buffer
548
549 // helper lambda that follows a list of hashmap entries and tries to write those that match
550 const auto &tryWriteMatchingBlocks = [&]( const std::vector<size_t> &list, const u_char *currBuf, uint reqMatches ){
551 // the number of blocks we have transferred to the target file
552 int targetBlocksWritten = 0;
553
554 // reset the next match hint
555 nextReqMatchInSequence.reset();
556
557 for ( const auto blkno : list ) {
558
559 if ( found[blkno] )
560 continue;
561
562 const auto blockRsum = &zsyncRsums[blkno];
563
564 uint weakMatches = 0;
565
566 // first check only the current block, we maybe can skip checking the others
567 // if we are in a run of matches
568 if ( (seqRsums[0].a & rsumAMask) != blockRsum[0].a ||
569 seqRsums[0].b != blockRsum[0].b )
570 continue;
571
572 weakMatches++;
573
574 for ( uint i = 1; i < reqMatches; i++ ) {
575 if ( (seqRsums[i].a & rsumAMask) != blockRsum[i].a ||
576 seqRsums[i].b != blockRsum[i].b )
577 break;
578 weakMatches++;
579 }
580
581 if ( weakMatches < reqMatches )
582 continue;
583
584 // we have a weak match, now we need to calc the checksums for the blocks
585 uint realMatches = 0;
586 for( uint i = 0; i < reqMatches; i++ ) {
587 if ( !checkChecksum(blkno + i, currBuf + ( i * blksize ), blksize ) ) {
588 break;
589 }
590 realMatches++;
591 }
592
593 // check if we have the amount of matches we need ( only 1 if we are in a block sequence )
594 if( realMatches < reqMatches )
595 continue;
596
597 // we found blocks that match , write them to target but keep searching the hashmap
598 // in case we have redundancies
599 const auto nextPossibleMatch = blkno + realMatches;
600 if ( !found[nextPossibleMatch] )
601 nextReqMatchInSequence = nextPossibleMatch; // remember that we are currently in a run of matches, next iteration we just need to look at one block
602
603 for( uint i = 0; i < realMatches; i++ ) {
604 writeBlock( blkno + i, wfp, currBuf + ( i * blksize ), blksize, 0, found );
605 targetBlocksWritten++;
606 }
607 }
608 return targetBlocksWritten;
609 };
610
611 if (!rsumseq)
612 rsumseq = nblks > 1 && chksumlen < 16 ? 2 : 1;
613
614 const off_t seqMatchLen = ( blksize * rsumseq ); //< how many bytes do we need to match when searching a block
615
616 while (! feof(fp) ) {
617 if ( init ) {
618 // fill the buffer for the first time
619 dataLen = fread( readBuf, 1, readBufSize, fp );
620 init = false;
621 } else {
622 // move the remaining data to the begin and read from the file to fill up the buffer again
623 const auto remainLen = dataLen-dataOffset;
624 if ( remainLen )
625 memmove( readBuf, readBuf+dataOffset, remainLen );
626
627 dataLen = fread( readBuf+remainLen, 1, readBufSize-remainLen, fp );
628 dataLen += remainLen;
629 dataOffset = 0;
630 }
631
632 // if we hit eof, pad with zeros
633 if ( feof(fp) ) {
634 memset( readBuf + dataLen, 0, readBufSize - dataLen );
635 dataLen = readBufSize;
636 }
637
638 if ( dataLen < seqMatchLen )
639 return;
640
641 // intialize our first set of checksums
642 for( uint i = 0; i < rsumseq; i++ )
643 seqRsums[i] = rcksum_calc_rsum_block( readBuf + ( i * blksize ), blksize );
644
645 //read over the buffer we have allocated so far
646 while ( true ) {
647
648 if ( dataOffset + seqMatchLen > dataLen )
649 break;
650
651 u_char *currBuf = readBuf + dataOffset;
652
653 // the number of deltafile blocks we have matched, e.g. how much blocks
654 // can we skip forward
655 uint deltaBlocksMatched = 0;
656
657 if ( nextReqMatchInSequence.has_value() ) {
658 if ( tryWriteMatchingBlocks( { *nextReqMatchInSequence }, currBuf, 1 ) > 0 )
659 deltaBlocksMatched = 1;
660
661 } else {
662 const auto hash = calc_rhash( seqRsums );
663
664 // reference to the list of blocks that share our calculated hash
665 auto &blockListForHash = rsumHashTable[ hash & rsumHashMask ];
666 if ( blockListForHash.size() ) {
667 if ( tryWriteMatchingBlocks( blockListForHash, currBuf, rsumseq ) > 0 )
668 deltaBlocksMatched = rsumseq;
669 }
670 }
671
672 if ( deltaBlocksMatched > 0 ) {
673 // we jump forward in the buffer to after what we matched
674 dataOffset += ( deltaBlocksMatched * blksize );
675
676 if ( dataOffset + seqMatchLen > dataLen )
677 break;
678
679 if ( deltaBlocksMatched < rsumseq ) {
680 //@TODO move the rsums we already have
681 }
682
683 for( uint i = 0; i < rsumseq; i++ )
684 seqRsums[i] = rcksum_calc_rsum_block( readBuf + dataOffset + ( i * blksize ), blksize );
685
686
687 } else {
688 // we found nothing advance the window by one byte and update the rsums
689 dataOffset++;
690 if ( dataOffset + seqMatchLen > dataLen )
691 break;
692 for ( uint i = 0; i < rsumseq; i++ ) {
693 const auto blkOff = ( i*blksize );
694 u_char oldC = (currBuf + blkOff)[0];
695 u_char newC = (currBuf + blkOff)[blksize];
696 UPDATE_RSUM( seqRsums[i].a, seqRsums[i].b, oldC, newC, bshift );
697 }
698 }
699 }
700 }
701 }
702 else if (chksumlen >= 16)
703 {
704 // dummy variant, just check the checksums
705 size_t bufl = 4096;
706 off_t off = 0;
707 auto buf = std::make_unique<unsigned char[]>( bufl );
708 for (size_t blkno = 0; blkno < blocks.size(); ++blkno)
709 {
710 if (off > blocks[blkno].off)
711 continue;
712 size_t blksize = blocks[blkno].size;
713 if (blksize > bufl)
714 {
715 bufl = blksize;
716 buf = std::make_unique<unsigned char[]>( bufl );
717 }
718 size_t skip = blocks[blkno].off - off;
719 while (skip)
720 {
721 size_t l = skip > bufl ? bufl : skip;
722 if (fread(buf.get(), l, 1, fp) != 1)
723 break;
724 skip -= l;
725 off += l;
726 }
727 if (fread(buf.get(), blksize, 1, fp) != 1)
728 break;
729 if (checkChecksum(blkno, buf.get(), blksize))
730 writeBlock(blkno, wfp, buf.get(), blksize, 0, found);
731 off += blksize;
732 }
733 }
734 if (!found[nblks]) {
735 DBG << "Delta XFER: No reusable blocks found for " << filename << std::endl;
736 return;
737 }
738 // now throw out all of the blocks we found
739 std::vector<MediaBlock> nblocks;
740 std::vector<unsigned char> nchksums;
741 std::vector<unsigned int> nrsums;
742
743 size_t originalSize = 0;
744 size_t newSize = 0;
745 for (size_t blkno = 0; blkno < blocks.size(); ++blkno)
746 {
747 const auto &blk = blocks[blkno];
748 originalSize += blk.size;
749 if (!found[blkno])
750 {
751 // still need it
752 nblocks.push_back(blk);
753 newSize += blk.size;
754 if (chksumlen && (blkno + 1) * chksumlen <= chksums.size())
755 {
756 nchksums.resize(nblocks.size() * chksumlen);
757 memcpy(&nchksums[(nblocks.size() - 1) * chksumlen], &chksums[blkno * chksumlen], chksumlen);
758 }
759 if (rsumlen && (blkno + 1) <= rsums.size())
760 nrsums.push_back(rsums[blkno]);
761 }
762 }
763 DBG << "Delta XFER: Found blocks to reuse, " << blocks.size() << " vs " << nblocks.size() << ", resused blocks: " << blocks.size() - nblocks.size() << "\n"
764 << "Old transfer size: " << originalSize << " new size: " << newSize << std::endl;
765 blocks = nblocks;
766 chksums = nchksums;
767 rsums = nrsums;
768}
769
770void MediaBlockList::reuseBlocksOld(FILE *wfp, const std::string& filename)
771{
772
774
775 if ( !chksumlen ) {
776 DBG << "Delta XFER: Can not reuse blocks because we have no chksumlen" << std::endl;
777 return;
778 }
779
780 if ( (fp = fopen(filename.c_str(), "r")) == 0 ) {
781 DBG << "Delta XFER: Can not reuse blocks, unable to open file "<< filename << std::endl;
782 return;
783 }
784 size_t nblks = blocks.size();
785 std::vector<bool> found;
786 found.resize(nblks + 1);
787 if (rsumlen && !rsums.empty())
788 {
789 size_t blksize = blocks[0].size;
790 if (nblks == 1 && rsumpad && rsumpad > blksize)
791 blksize = rsumpad;
792
793 // create hash of checksums
794
795 // calculate the size of the hashtable by setting
796 // all bits to 1 up the to currently set MSB
797 // if we have 00010010 we end up with 00011111
798 unsigned int hm = rsums.size() * 2;
799 while (hm & (hm - 1)) {
800 hm &= hm - 1;
801 }
802 hm = hm * 2 - 1;
803
804 // we want at least a size if 0011 1111 1111 1111
805 if (hm < 16383)
806 hm = 16383;
807
808 // simple hashtable of checksums
809 auto rsumHashTable = std::make_unique<unsigned int[]>( hm+1 );
810 memset(rsumHashTable.get(), 0, (hm + 1) * sizeof(unsigned int));
811
812 // insert each rsum into the hash table
813 for (unsigned int i = 0; i < rsums.size(); i++)
814 {
815 if (blocks[i].size != blksize && (i != nblks - 1 || rsumpad != blksize))
816 continue;
817 unsigned int r = rsums[i];
818 unsigned int h = r & hm;
819 unsigned int hh = 7;
820 while (rsumHashTable[h])
821 h = (h + hh++) & hm;
822 rsumHashTable[h] = i + 1;
823 }
824
825 // read in block by block to find matches
826 // the read buffer "buf" works like a ring buffer, means that once we are done with reading a full block
827 // and didn't find a match we start again at buf[0] , filling the buffer up again, rotating until we find
828 // a matching block. Once we find a matching block all we need to do is check if the current offset "i"
829 // is at the end of the buffer, then we can simply write the full buffer out, or if its somewhere in between
830 // then the begin of our block is buf[i+1, bufsize-1] and the end buf[0,i]
831 auto ringBuf = std::make_unique<unsigned char[]>( blksize );
832
833 // we use a second buffer to read in the next block if we are required to match more than one block at the same time.
834 auto buf2 = std::make_unique<unsigned char[]>( blksize );
835
836 // when we are required to match more than one block, it is read into buf2 advancing the file pointer,
837 // to make sure that we do not loose those bytes in case the match fails we remember their count and
838 // start in buf2, in the next loop those will be consumed before reading from the file again
839 size_t pushback = 0;
840 unsigned char *pushbackp = 0;
841
842 // use byteshift instead of multiplication if the blksize is a power of 2
843 // a value is a power of 2 if ( N & N-1 ) == 0
844 int bshift = 0; // how many bytes do we need to shift
845 if ((blksize & (blksize - 1)) == 0)
846 for (bshift = 0; size_t(1 << bshift) != blksize; bshift++)
847 ;
848
849 // a and b are the LS and MS bytes of the checksum, calculated a rolling style Adler32 checksum
850 //
851 // a(k,l) = (\sum_{i=k}^l X_i) \bmod M
852 unsigned short a = 0, b = 0;
853 a = b = 0;
854 memset(ringBuf.get(), 0, blksize);
855 bool eof = 0;
856 bool init = 1;
857
858 if (!rsumseq)
859 rsumseq = nblks > 1 && chksumlen < 16 ? 2 : 1;
860
861 while (!eof)
862 {
863 for (size_t i = 0; i < blksize; i++)
864 {
865 // get the next character from the file
866 // or if there are pushback chars use those
867 int c = 0;
868 if (eof)
869 c = 0;
870 else
871 {
872 if (pushback)
873 {
874 c = *pushbackp++;
875 pushback--;
876 }
877 else
878 c = getc(fp);
879 if (c == EOF)
880 {
881 eof = true;
882 c = 0;
883 if (!i || rsumseq == 2)
884 break;
885 }
886 }
887
888 // calculate the rsum on the fly using recurrence relations, see https://rsync.samba.org/tech_report/node3.html
889 // basically we subtract the checksum value of a byte the leaves the current block window and add the new incoming one
890 // using this trick we do not need to calculate the full block checksum
891 // the least significant part of the checksum ( lower 8 bits ) is simply the sum of all chars in the block , modulo 2^16
892 // zsync uses only a 16bit type to calculate the sums and as far as i can see does not do the modulo per block as the formula
893 // says it should, we might need to do the same
894 int oc = ringBuf[i];
895 ringBuf[i] = c;
896
897 a += c - oc;
898
899 // this is calculates the most significant part of the checksum, bshift should be always set since blocksize
900 // should always be a power of 2
901 if (bshift)
902 b += a - ( oc << bshift );
903 else
904 // This seems to make no sense it does not even factor in the character itself
905 b += 2 * blksize;
906
907 if (init)
908 {
909 // continue reading bytes until we have the full block in our buffer
910 if (size_t(i) != blksize - 1)
911 continue;
912 init = 0;
913 }
914
915 unsigned int r = ((unsigned int)a & 65535) << 16 | ((unsigned int)b & 65535);
916 truncateRsum(r, rsumlen);
917
918 unsigned int h = r & hm;
919 unsigned int hh = 7;
920
921 // go through our hashmap to find all the matching rsums
922 for (; rsumHashTable[h]; h = (h + hh++) & hm)
923 {
924 size_t blkno = rsumHashTable[h] - 1;
925
926 // does the current block match?
927 if (rsums[blkno] != r)
928 continue;
929 if (found[blkno])
930 continue;
931
932 // if we need to always match 2 blocks in sequence, get the next block
933 // and check its checksum
934 if (rsumseq == 2)
935 {
936 if (eof || blkno + 1 >= nblks)
937 continue;
938 pushback = fetchnext(fp, buf2.get(), blksize, pushback, pushbackp);
939 pushbackp = buf2.get();
940 if (!pushback)
941 continue;
942
943 if (!checkRsum(blkno + 1, buf2.get(), blksize))
944 continue;
945 }
946
947 // here we have matched all blocks that we need, do the heavy checksum
948 if (!checkChecksumRotated(blkno, ringBuf.get(), blksize, i + 1))
949 continue;
950
951 // heavy checksum for second block
952 if (rsumseq == 2 && !checkChecksum(blkno + 1, buf2.get(), blksize))
953 continue;
954
955 // write the first and second blocks if applicable
956 writeBlock(blkno, wfp, ringBuf.get(), blksize, i + 1, found);
957 if (rsumseq == 2)
958 {
959 writeBlock(blkno + 1, wfp, buf2.get(), blksize, 0, found);
960 pushback = 0;
961 blkno++;
962 }
963
964 // try to continue as long as we still match blocks
965 while (!eof)
966 {
967 blkno++;
968 pushback = fetchnext(fp, buf2.get(), blksize, pushback, pushbackp);
969 pushbackp = buf2.get();
970 if (!pushback)
971 break;
972
973 if (!checkRsum(blkno, buf2.get(), blksize))
974 break;
975
976 if (!checkChecksum(blkno, buf2.get(), blksize))
977 break;
978
979 writeBlock(blkno, wfp, buf2.get(), blksize, 0, found);
980 pushback = 0;
981 }
982
983 // if we get to this part we found at least a block, skip over the current block and start reading
984 // in a full block
985 init = true;
986 memset(ringBuf.get(), 0, blksize);
987 a = b = 0;
988 i = size_t(-1); // start with 0 on next iteration
989 break;
990 }
991 }
992 }
993 }
994 else if (chksumlen >= 16)
995 {
996 // dummy variant, just check the checksums
997 size_t bufl = 4096;
998 off_t off = 0;
999 auto buf = std::make_unique<unsigned char[]>( bufl );
1000 for (size_t blkno = 0; blkno < blocks.size(); ++blkno)
1001 {
1002 if (off > blocks[blkno].off)
1003 continue;
1004 size_t blksize = blocks[blkno].size;
1005 if (blksize > bufl)
1006 {
1007 bufl = blksize;
1008 buf = std::make_unique<unsigned char[]>( bufl );
1009 }
1010 size_t skip = blocks[blkno].off - off;
1011 while (skip)
1012 {
1013 size_t l = skip > bufl ? bufl : skip;
1014 if (fread(buf.get(), l, 1, fp) != 1)
1015 break;
1016 skip -= l;
1017 off += l;
1018 }
1019 if (fread(buf.get(), blksize, 1, fp) != 1)
1020 break;
1021 if (checkChecksum(blkno, buf.get(), blksize))
1022 writeBlock(blkno, wfp, buf.get(), blksize, 0, found);
1023 off += blksize;
1024 }
1025 }
1026 if (!found[nblks]) {
1027 DBG << "Delta XFER: No reusable blocks found for " << filename << std::endl;
1028 return;
1029 }
1030 // now throw out all of the blocks we found
1031 std::vector<MediaBlock> nblocks;
1032 std::vector<unsigned char> nchksums;
1033 std::vector<unsigned int> nrsums;
1034
1035 size_t originalSize = 0;
1036 size_t newSize = 0;
1037 for (size_t blkno = 0; blkno < blocks.size(); ++blkno)
1038 {
1039 const auto &blk = blocks[blkno];
1040 originalSize += blk.size;
1041 if (!found[blkno])
1042 {
1043 // still need it
1044 nblocks.push_back(blk);
1045 newSize += blk.size;
1046 if (chksumlen && (blkno + 1) * chksumlen <= chksums.size())
1047 {
1048 nchksums.resize(nblocks.size() * chksumlen);
1049 memcpy(&nchksums[(nblocks.size() - 1) * chksumlen], &chksums[blkno * chksumlen], chksumlen);
1050 }
1051 if (rsumlen && (blkno + 1) <= rsums.size())
1052 nrsums.push_back(rsums[blkno]);
1053 }
1054 }
1055 DBG << "Delta XFER: Found blocks to reuse, " << blocks.size() << " vs " << nblocks.size() << ", resused blocks: " << blocks.size() - nblocks.size() << "\n"
1056 << "Old transfer size: " << originalSize << " new size: " << newSize << std::endl;
1057 blocks = nblocks;
1058 chksums = nchksums;
1059 rsums = nrsums;
1060}
1061
1062std::string
1064{
1065 std::string s;
1066 size_t i = 0, j = 0;
1067
1068 if (filesize != off_t(-1))
1069 {
1070 long long size = filesize;
1071 s = zypp::str::form("[ BlockList, file size %lld\n", size);
1072 }
1073 else
1074 s = "[ BlockList, filesize unknown\n";
1075 if (!haveblocks)
1076 s += " No block information\n";
1077 if (chksumpad)
1078 s += zypp::str::form(" Checksum pad %zd\n", chksumpad);
1079 if (rsumpad)
1080 s += zypp::str::form(" Rsum pad %zd\n", rsumpad);
1081 for (i = 0; i < blocks.size(); ++i)
1082 {
1083 long long off=blocks[i].off;
1084 long long size=blocks[i].size;
1085 s += zypp::str::form(" (%8lld, %8lld)", off, size);
1086 if (chksumlen && chksums.size() >= (i + 1) * chksumlen)
1087 {
1088 s += " " + chksumtype + ":";
1089 for (j = 0; j < size_t(chksumlen); j++)
1090 s += zypp::str::form("%02hhx", chksums[i * chksumlen + j]);
1091 }
1092 if (rsumlen && rsums.size() > i)
1093 {
1094 s += " RSUM:";
1095 s += zypp::str::form("%0*x", 2 * rsumlen, rsums[i]);
1096 }
1097 s += "\n";
1098 }
1099 s += "]";
1100 return s;
1101}
1102
1103 } // namespace media
1104} // namespace zypp
Compute Message Digests (MD5, SHA1 etc)
Definition Digest.h:38
UByteArray digestVector()
get vector of unsigned char representation of the digest
Definition Digest.cc:293
bool update(const char *bytes, size_t len)
feed data into digest computation algorithm
Definition Digest.cc:311
bool create(const std::string &name)
initialize creation of a new message digest
Definition Digest.cc:197
std::vector< unsigned int > rsums
void setRsum(size_t blkno, int rsl, unsigned int rs, size_t rspad=0)
set / verify the (weak) rolling checksum over a single block
bool haveChecksum(size_t blkno) const
void setFileChecksum(std::string ctype, int cl, unsigned char *c)
set / verify the checksum over the whole file
bool verifyRsum(size_t blkno, unsigned int rs) const
const UByteArray & getFileChecksum()
void writeBlock(size_t blkno, FILE *fp, const unsigned char *buf, size_t bufl, size_t start, std::vector< bool > &found) const
bool createDigest(Digest &digest) const
std::string asString() const
return block list as string
void setRsumSequence(uint seq)
how many blocks in sequence need to have the correct checksums to be considered a match
UByteArray getChecksum(size_t blkno) const
bool checkChecksumRotated(size_t blkno, const unsigned char *buf, size_t bufl, size_t start) const
std::vector< unsigned char > chksums
size_t addBlock(off_t off, size_t size)
add a block with offset off and size size to the block list.
unsigned int updateRsum(unsigned int rs, const char *bytes, size_t len) const
void reuseBlocks(FILE *wfp, const std::string &filename)
void reuseBlocksOld(FILE *wfp, const std::string &filename)
scan a file for blocks from our blocklist.
bool verifyDigest(size_t blkno, Digest &digest) const
bool checkRsum(size_t blkno, const unsigned char *buf, size_t bufl) const
std::vector< MediaBlock > blocks
std::string getChecksumType() const
bool createFileDigest(Digest &digest) const
bool verifyFileDigest(Digest &digest) const
std::string fileChecksumType() const
bool haveRsum(size_t blkno) const
bool checkChecksum(size_t blkno, const unsigned char *buf, size_t bufl) const
void setChecksum(size_t blkno, const std::string &cstype, int csl, unsigned char *cs, size_t cspad=0)
set / verify the (strong) checksum over a single block
MediaBlockList(off_t filesize=off_t(-1))
#define UPDATE_RSUM(a, b, oldc, newc, bshift)
unsigned short a
unsigned short b
static size_t fetchnext(FILE *fp, unsigned char *bp, size_t blksize, size_t pushback, unsigned char *pushbackp)
std::string form(const char *format,...) __attribute__((format(printf
Printf style construction of std::string.
Definition String.cc:37
Easy-to use interface to the ZYPP dependency resolver.
AutoDispose<FILE*> calling ::fclose
a single block from the blocklist, consisting of an offset and a size
#define DBG
Definition Logger.h:97