SDSL 3.0.3
Succinct Data Structure Library
Loading...
Searching...
No Matches
construct_sa_se.hpp
Go to the documentation of this file.
1// Copyright (c) 2016, the SDSL Project Authors. All rights reserved.
2// Please see the AUTHORS file for details. Use of this source code is governed
3// by a BSD license that can be found in the LICENSE file.
4#ifndef SDSL_CONSTRUCT_SA_SE
5#define SDSL_CONSTRUCT_SA_SE
6
7#include <assert.h>
8#include <cstdint>
9#include <fstream>
10#include <memory>
11#include <stddef.h>
12#include <string>
13#include <utility>
14#include <vector>
15
16#include <sdsl/bits.hpp>
17#include <sdsl/int_vector.hpp>
19#include <sdsl/io.hpp>
21#include <sdsl/ram_fs.hpp>
23#include <sdsl/util.hpp>
24
25namespace sdsl
26{
27
28template <class int_vector_type>
29uint64_t _get_next_lms_position(int_vector_type & text, uint64_t i)
30{
31 if (i >= text.size() - 3)
32 {
33 return text.size() - 1;
34 }
35 // text[i] is S-TYP or L-TYP
36 uint64_t ci = text[i], cip1 = text[i + 1];
37 while (ci <= cip1)
38 {
39 ++i;
40 ci = cip1;
41 cip1 = text[i + 1];
42 }
43 // text[i] is L-TYP or S-TYP
44 uint64_t candidate = i + 1;
45 while (ci >= cip1)
46 {
47 if (ci > cip1)
48 {
49 if (i + 1 == text.size() - 1)
50 {
51 return text.size() - 1;
52 }
53 candidate = i + 1;
54 }
55 ++i;
56 ci = cip1;
57 cip1 = text[i + 1];
58 }
59 return candidate;
60}
61
62inline void _construct_sa_IS(int_vector<> & text,
63 int_vector<> & sa,
64 std::string & filename_sa,
65 size_t n,
66 size_t text_offset,
67 size_t sigma,
68 uint64_t recursion)
69{
70 uint64_t buffersize = 1024 * 1024 / 8;
71
72 size_t name = 0;
73 size_t number_of_lms_strings = 0;
74 std::string filename_c_array = tmp_file(filename_sa, "_c_array" + util::to_string(recursion));
75 // Phase 1
76 {
77 std::vector<uint64_t> bkt(sigma, 0);
78 // Step 1 - Count characters into c array
79 // TODO: better create this in higher recursion-level
80 for (size_t i = 0; i < n; ++i)
81 {
82 ++bkt[text[text_offset + i]];
83 }
84
85 // Step 1.5 save them into cached_external_array
86 int_vector_buffer<> c_array(filename_c_array, std::ios::out, buffersize, 64);
87 for (size_t c = 0; c < sigma; ++c)
88 {
89 c_array[c] = bkt[c];
90 }
91
92 // Step 2 Calculate End-Pointer of Buckets
93 bkt[0] = 0;
94 for (size_t c = 1; c < sigma; ++c)
95 {
96 bkt[c] = bkt[c - 1] + bkt[c];
97 }
98
99 // Step 3 - Insert S*-positions into correct bucket of SA but not in correct order inside the buckets
100 for (size_t i = n - 2, was_s_typ = 1; i < n; --i)
101 {
102 if (text[text_offset + i] > text[text_offset + i + 1])
103 {
104 if (was_s_typ)
105 {
106 sa[bkt[text[text_offset + i + 1]]--] = i + 1;
107 ++number_of_lms_strings;
108 was_s_typ = 0;
109 }
110 }
111 else if (text[text_offset + i] < text[text_offset + i + 1])
112 {
113 was_s_typ = 1;
114 }
115 }
116
117 // Step 4 - Calculate Begin-Pointer of Buckets
118 bkt[0] = 0;
119 for (size_t c = 1; c < sigma; ++c)
120 {
121 bkt[c] = bkt[c - 1] + c_array[c - 1];
122 }
123
124 // Step 5 - Scan from Left-To-Right to induce L-Types
125 for (size_t i = 0; i < n; ++i)
126 {
127 if (sa[i] > 0 and text[text_offset + sa[i]] <= text[text_offset + sa[i] - 1])
128 { // faster than if(sa[i]>0 and bkt_beg[text[ sa[i]-1 ]] > i)
129 sa[bkt[text[text_offset + sa[i] - 1]]++] = sa[i] - 1;
130 sa[i] = 0;
131 }
132 }
133
134 // Step 6 - Scan from Right-To-Left to induce S-Types
135 bkt[0] = 0;
136 for (size_t c = 1; c < sigma; ++c)
137 {
138 bkt[c] = bkt[c - 1] + c_array[c];
139 }
140 c_array.close();
141
142 for (size_t i = n - 1, endpointer = n; i < n; --i)
143 {
144 if (sa[i] > 0)
145 {
146 if (text[text_offset + sa[i] - 1] <= text[text_offset + sa[i]])
147 { // faster than if(bkt_end[text[ sa[i]-1 ]] < i)
148 sa[bkt[text[text_offset + sa[i] - 1]]--] = sa[i] - 1;
149 }
150 else
151 {
152 sa[--endpointer] = sa[i];
153 }
154 sa[i] = 0;
155 }
156 }
157
158 // Step 7 - Determine length of LMS-Strings
159 for (size_t i = n - 2, end = n - 2, was_s_typ = 1; i < n; --i)
160 {
161 if (text[text_offset + i] > text[text_offset + i + 1])
162 {
163 if (was_s_typ)
164 {
165 sa[(i + 1) >> 1] = end - i;
166 end = i + 1;
167 was_s_typ = 0;
168 }
169 }
170 else if (text[text_offset + i] < text[text_offset + i + 1])
171 {
172 was_s_typ = 1;
173 }
174 }
175
176 // Step 8 - Rename
177 for (size_t i = n - number_of_lms_strings + 1, cur_pos = 0, cur_len = 0, last_pos = n - 1, last_len = 1; i < n;
178 ++i)
179 {
180 cur_pos = sa[i];
181 cur_len = sa[(cur_pos >> 1)];
182 if (cur_len == last_len)
183 {
184 size_t l = 0;
185 while (l < cur_len and text[text_offset + cur_pos + l] == text[text_offset + last_pos + l])
186 {
187 ++l;
188 }
189 if (l >= cur_len)
190 {
191 --name;
192 }
193 }
194 sa[(cur_pos >> 1)] = ++name;
195 last_pos = cur_pos;
196 last_len = cur_len;
197 }
198 }
199
200 // Step 9 - Calculate SA of new string - in most cases recursive
201 if (name + 1 < number_of_lms_strings)
202 {
203 // Move Names to the end
204 for (size_t i = 0, t = n - number_of_lms_strings; i < (n >> 1); ++i)
205 {
206 if (sa[i] > 0)
207 {
208 sa[t++] = sa[i];
209 sa[i] = 0;
210 }
211 }
212 sa[n - 1] = 0;
213
214 // Recursive call
215 std::string filename_sa_rec = tmp_file(filename_sa, "_sa_rec" + util::to_string(recursion + 1));
217 sa,
218 filename_sa_rec,
219 number_of_lms_strings,
220 n - number_of_lms_strings,
221 name + 1,
222 recursion + 1);
223
224 for (size_t i = n - 2, endpointer = n - 1, was_s_typ = 1; i < n; --i)
225 {
226 if (text[text_offset + i] > text[text_offset + i + 1])
227 {
228 if (was_s_typ)
229 {
230 sa[endpointer--] = i + 1;
231 was_s_typ = 0;
232 }
233 }
234 else if (text[text_offset + i] < text[text_offset + i + 1])
235 {
236 was_s_typ = 1;
237 }
238 }
239
240 // Sort S*-positions in correct order into SA
241 for (size_t i = 0; i < number_of_lms_strings; ++i)
242 {
243 size_t pos = sa[i];
244 sa[i] = sa[n - number_of_lms_strings + pos];
245 sa[n - number_of_lms_strings + pos] = 0;
246 }
247 }
248 else
249 {
250 // Move s*-Positions to front
251 sa[0] = n - 1;
252 for (size_t i = 1; i < number_of_lms_strings; ++i)
253 {
254 sa[i] = sa[n - number_of_lms_strings + i];
255 sa[n - number_of_lms_strings + i] = 0;
256 }
257 // Clear lex. names
258 for (size_t i = number_of_lms_strings; i < (n >> 1); ++i)
259 {
260 sa[i] = 0;
261 }
262 }
263
264 // Phase 3
265 {
266 // Step 10 - Count characters into c array
267
268 // Step 11 - Calculate End-Pointer of Buckets
269 int_vector_buffer<> c_array(filename_c_array, std::ios::in, buffersize, 64);
270 std::vector<uint64_t> bkt(sigma, 0);
271 for (size_t c = 1; c < sigma; ++c)
272 {
273 bkt[c] = bkt[c - 1] + c_array[c];
274 }
275
276 // Step 12 - Move S*-positions in correct order into SA
277 for (size_t i = number_of_lms_strings - 1; i < n; --i)
278 {
279 size_t pos = sa[i];
280 sa[i] = 0;
281 sa[bkt[text[text_offset + pos]]--] = pos;
282 }
283
284 // Step 13 - Calculate Begin-Pointer of Buckets
285 bkt[0] = 0;
286 for (size_t c = 1; c < sigma; ++c)
287 {
288 bkt[c] = bkt[c - 1] + c_array[c - 1];
289 }
290
291 // Step 14 - Scan from Left-To-Right to induce L-Types
292 for (size_t i = 0; i < n; ++i)
293 {
294 if (sa[i] > 0 and text[text_offset + sa[i]] <= text[text_offset + sa[i] - 1])
295 { // faster than if(sa[i]>0 and bkt_beg[text[ sa[i]-1 ]] > i)
296 sa[bkt[text[text_offset + sa[i] - 1]]++] = sa[i] - 1;
297 }
298 }
299
300 // Step 15 - Scan from Right-To-Left to induce S-Types
301 bkt[0] = 0;
302 for (size_t c = 1; c < sigma; ++c)
303 {
304 bkt[c] = bkt[c - 1] + c_array[c];
305 }
306 for (size_t i = n - 1; i < n; --i)
307 {
308 if (sa[i] > 0 and text[text_offset + sa[i] - 1] <= text[text_offset + sa[i]])
309 {
310 sa[bkt[text[text_offset + sa[i] - 1]]--] = sa[i] - 1;
311 }
312 }
313 c_array.close(true);
314 }
315}
316
317template <class int_vector_type>
318void _construct_sa_se(int_vector_type & text, std::string filename_sa, uint64_t sigma, uint64_t recursion)
319{
320 std::string filename_text = tmp_file(filename_sa, "_text_rec" + util::to_string(recursion));
321 store_to_file(text, filename_text);
322 uint64_t n = text.size();
323 uint64_t nsize = bits::hi(n) + 1;
324 uint8_t int_width = bits::hi(n - 1) + 1;
325 uint64_t buffersize = 1024 * 1024 / 8;
326
327 // Step 1 - Scan Text from right to left and count LMS, S and L characters and store lms_positions
328
329 // Define variables
330 size_t first_lms_pos = 0;
331 size_t number_of_lms_strings = 0;
332 size_t bkt_s_last = 0, bkt_s_sum = 0, bound_s = 0, bkt_l_sum = 0;
333 int_vector<> C(sigma, 0, int_width);
334 int_vector<> bkt_lms(sigma, 0, int_width);
335 int_vector<> bkt_s(sigma, 0, int_width);
336 int_vector<> bkt_l(sigma, 0, int_width);
337 std::string filename_lms_pos_b = tmp_file(filename_sa, "_lms_pos_b" + util::to_string(recursion));
338 size_t parts = 10;
339
340 {
341 int_vector_buffer<1> lms_pos_b(filename_lms_pos_b, std::ios::out, buffersize, 1);
342 uint64_t ci = text[n - 1];
343 ++C[ci];
344 bool was_s_typ = 1;
345 for (size_t i = n - 2; i < n; --i)
346 {
347 uint64_t cip1 = ci;
348 ci = text[i];
349 ++C[ci];
350 if (was_s_typ)
351 {
352 ++bkt_s[text[i + 1]];
353 if (ci > cip1)
354 {
355 ++bkt_lms[cip1];
356 lms_pos_b[i + 1] = 1;
357 ++number_of_lms_strings;
358 first_lms_pos = i + 1;
359 was_s_typ = 0;
360 }
361 }
362 else if (ci < cip1)
363 {
364 was_s_typ = 1;
365 }
366 }
367 if (was_s_typ)
368 {
369 ++bkt_s[ci];
370 }
371 bkt_l[0] = C[0] - bkt_s[0];
372 for (size_t i = 1; i < C.size(); ++i)
373 {
374 bkt_l[i] = C[i] - bkt_s[i];
375 C[i] = C[i] + C[i - 1];
376 }
377 lms_pos_b.close();
378 }
379
380 // Step 2 - Scan Text from right to left and detect LMS-Positions. Sort and write them to disk
381 int_vector_buffer<> right(tmp_file(filename_sa, "_right" + util::to_string(recursion)),
382 std::ios::out,
383 buffersize,
384 nsize);
385 size_t right_pointer = 0;
386 int_vector_buffer<> left(tmp_file(filename_sa, "_left" + util::to_string(recursion)),
387 std::ios::out,
388 buffersize,
389 nsize);
390 size_t left_pointer = 0;
391 {
392 for (size_t i = 0, tmp2 = 0, tmp = 0; i < sigma; ++i)
393 {
394 tmp += bkt_lms[i];
395 bkt_lms[i] = tmp2;
396 tmp2 = tmp;
397 }
398 int_vector_buffer<> lms_positions(tmp_file(filename_sa, "_lms_positions" + util::to_string(recursion)),
399 std::ios::out,
400 buffersize,
401 nsize);
402 for (size_t i = n - 2, was_s_typ = 1, ci = text[n - 1]; i < n; --i)
403 {
404 uint64_t cip1 = ci;
405 ci = text[i];
406 if (ci > cip1)
407 {
408 if (was_s_typ)
409 {
410 lms_positions.push_back(bkt_lms[cip1]);
411 lms_positions.push_back(i + 1);
412 ++bkt_lms[cip1];
413 was_s_typ = 0;
414 }
415 }
416 else if (ci < cip1)
417 {
418 was_s_typ = 1;
419 }
420 }
421 util::clear(text);
422 {
423 // Order lms_positions according to first character
424 int_vector<> lms_strings(number_of_lms_strings, 0, int_width);
425 for (size_t i = 0; i < lms_positions.size();)
426 {
427 size_t idx = lms_positions[i++];
428 size_t val = lms_positions[i++];
429 lms_strings[idx] = val;
430 }
431 lms_positions.close(true);
432 // Store it to file
433 left_pointer = 0;
434 for (size_t i = 0; i < number_of_lms_strings; ++i)
435 {
436 left[left_pointer++] = lms_strings[number_of_lms_strings - i - 1];
437 }
438 }
439 load_from_file(text, filename_text);
440 }
441 left_pointer--;
442
443 // Step 3 - Scan virtual array from left to right
444 {
445 // prepare bkt_l and backup it into bkt_lms
446 for (size_t i = 0, tmp = 0; i < sigma; ++i)
447 {
448 tmp = bkt_l[i];
449 bkt_l[i] = bkt_l_sum;
450 bkt_l_sum += tmp;
451 bkt_lms[i] = bkt_l[i];
452 }
453
454 size_t partsize = bkt_l_sum / parts + 1;
455
456 int_vector<> array(partsize, 0, int_width);
457 std::vector<int_vector_buffer<>> cached_array(parts - 1);
458 for (size_t i = 0; i < cached_array.size(); ++i)
459 {
460 cached_array[i] = int_vector_buffer<>(
461 tmp_file(filename_sa, "_rightbuffer" + util::to_string(i) + "_" + util::to_string(recursion)),
462 std::ios::out,
463 buffersize,
464 nsize);
465 }
466
467 for (size_t c = 0, pos = 0, offset = 0; c < sigma; ++c)
468 {
469 // begin with array
470 for (; pos < bkt_l[c]; ++pos)
471 {
472 // Load lazy values
473 if (pos - offset >= partsize)
474 {
475 offset += partsize;
476 for (size_t i = 0, cur_part = pos / partsize - 1; i < cached_array[cur_part].size();)
477 {
478 size_t src = cached_array[cur_part][i++];
479 size_t val = cached_array[cur_part][i++];
480 array[src - offset] = val;
481 }
482 cached_array[pos / partsize - 1].reset();
483 }
484
485 size_t idx = array[pos - offset];
486 if (idx == 0)
487 {
488 right[right_pointer++] = idx;
489 }
490 else
491 {
492 size_t symbol = text[idx - 1];
493 if (symbol >= c)
494 {
495 size_t val = idx - 1;
496 size_t src = bkt_l[symbol];
497 bkt_l[symbol] = bkt_l[symbol] + 1;
498 if ((src - offset) / partsize == 0)
499 {
500 array[src - offset] = val;
501 }
502 else
503 {
504 size_t part = src / partsize - 1;
505 cached_array[part].push_back(src);
506 cached_array[part].push_back(val);
507 }
508 }
509 else
510 {
511 right[right_pointer++] = idx;
512 }
513 }
514 }
515 // continue with stack
516 while (left_pointer < number_of_lms_strings and text[left[left_pointer]] == c)
517 {
518 size_t idx = left[left_pointer--];
519 --idx;
520 size_t symbol = text[idx];
521
522 size_t val = idx;
523 size_t src = bkt_l[symbol];
524 bkt_l[symbol] = bkt_l[symbol] + 1;
525 if ((src - offset) / partsize == 0)
526 {
527 array[src - offset] = val;
528 }
529 else
530 {
531 size_t part = src / partsize - 1;
532 cached_array[part].push_back(src);
533 cached_array[part].push_back(val);
534 }
535 }
536 }
537 for (size_t i = 0; i < cached_array.size(); ++i)
538 {
539 cached_array[i].close(true);
540 }
541
542 // Restore bkt_l from bkt_lms
543 for (size_t i = 0; i < sigma; ++i)
544 {
545 bkt_l[i] = bkt_lms[i];
546 }
547 }
548 right_pointer--;
549
550 // Step 4 - Scan virtual array from right to left
551 left_pointer = 0;
552 left.reset();
553 {
554 // Prepare bkt_s and backup it into bkt_lms
555 bkt_s_last = 0, bkt_s_sum = 0;
556 for (size_t i = 0; i < sigma; ++i)
557 {
558 bkt_s_sum += bkt_s[i];
559 if (bkt_s[i])
560 {
561 bkt_s[i] = bkt_s_sum;
562 bkt_s_last = bkt_s_sum;
563 }
564 else
565 {
566 bkt_s[i] = bkt_s_sum;
567 }
568 bkt_lms[i] = bkt_s[i];
569 }
570 bound_s = bkt_s_sum;
571
572 // Determine splitting parameters
573 for (size_t i = 0; i < sigma; ++i)
574 {
575 if (bkt_s[i] > bkt_s_sum / 2)
576 {
577 bkt_s_sum = bkt_s[i];
578 break;
579 }
580 }
581
582 size_t partsize = bound_s / parts + 1;
583 int_vector<> array(partsize, 0, int_width);
584 std::vector<int_vector_buffer<>> cached_array(parts - 1);
585 for (size_t i = 0; i < cached_array.size(); ++i)
586 {
587 cached_array[i] = int_vector_buffer<>(
588 tmp_file(filename_sa, "_leftbuffer" + util::to_string(i) + "_" + util::to_string(recursion)),
589 std::ios::out,
590 buffersize,
591 nsize);
592 }
593 for (size_t c = sigma - 1, pos = bkt_s_last - 1, offset = partsize * (parts - 1); c < sigma; --c)
594 {
595 // begin with array
596 for (; pos + 1 > bkt_s[c]; --pos)
597 {
598 while (pos < offset)
599 {
600 // Load buffered values
601 offset -= partsize;
602 for (size_t i = 0, cur_part = offset / partsize; i < cached_array[cur_part].size();)
603 {
604 size_t src = cached_array[cur_part][i++];
605 size_t val = cached_array[cur_part][i++];
606 array[src - offset] = val;
607 }
608 cached_array[offset / partsize].reset();
609 }
610
611 size_t idx = array[pos - offset];
612 if (idx == 0)
613 {
614 idx = n;
615 }
616 --idx;
617 size_t symbol = text[idx];
618 if (symbol <= c)
619 {
620 bkt_s[symbol] = bkt_s[symbol] - 1;
621 size_t val = idx;
622 size_t src = bkt_s[symbol];
623 if (src >= offset)
624 {
625 array[src - offset] = val;
626 }
627 else
628 {
629 size_t part = src / partsize;
630 cached_array[part].push_back(src);
631 cached_array[part].push_back(val);
632 }
633 }
634 else
635 {
636 left[left_pointer++] = array[pos - offset];
637 }
638 }
639
640 // continue with stack
641 while (right_pointer < number_of_lms_strings and text[right[right_pointer]] == c)
642 {
643 size_t idx = right[right_pointer--];
644 if (idx == 0)
645 {
646 idx = n;
647 }
648 --idx;
649 size_t symbol = text[idx];
650 bkt_s[symbol] = bkt_s[symbol] - 1;
651
652 size_t val = idx;
653 size_t src = bkt_s[symbol];
654 if (src >= offset)
655 {
656 array[src - offset] = val;
657 }
658 else
659 {
660 size_t part = src / partsize;
661 cached_array[part].push_back(src);
662 cached_array[part].push_back(val);
663 }
664 }
665 }
666 for (size_t i = 0; i < cached_array.size(); ++i)
667 {
668 cached_array[i].close(true);
669 }
670 // Restore bkt_s from bkt_lms
671 for (size_t i = 0; i < sigma; ++i)
672 {
673 bkt_s[i] = bkt_lms[i];
674 }
675 }
676 right.buffersize(0);
677 right.reset();
678 right_pointer = 0;
679 --left_pointer;
680
681 // Step 5 - Detect same lms-Strings, write text to file
682 int_vector<1> same_lms(number_of_lms_strings, false);
683 size_t last_end_pos = first_lms_pos, order = number_of_lms_strings - 1;
684 same_lms[number_of_lms_strings - 1] = true;
685 for (size_t i = number_of_lms_strings - 2, a = 0, b = 0, last_a = left[number_of_lms_strings - 1];
686 i < number_of_lms_strings;
687 --i)
688 {
689 b = last_a;
690 a = left[i];
691 last_a = a;
692
693 size_t end_pos = _get_next_lms_position(text, a);
694 if (end_pos - a == last_end_pos - b)
695 {
696 while (a < end_pos and text[a] == text[b])
697 {
698 ++a;
699 ++b;
700 }
701 if (text[a] == text[b])
702 {
703 same_lms[i] = true;
704 --order;
705 }
706 }
707 last_end_pos = end_pos;
708 }
709 util::clear(text);
710
711 // Step 7 - Create renamed string
712 int_vector<> text_rec;
713 if (recursion == 0)
714 {
715 text_rec.width((bits::hi(order + 1) + 1));
716 }
717 else
718 {
719 text_rec.width((bits::hi(number_of_lms_strings + 1) + 1));
720 }
721 text_rec.resize(number_of_lms_strings);
722 util::_set_zero_bits(text_rec);
723 {
724 if (recursion == 0 and n / 2 * text_rec.width() > 8 * n)
725 {
726 size_t size_of_part = n / 4 + 3;
727 text_rec.resize(size_of_part);
728 util::_set_zero_bits(text_rec);
729 order = 0;
730 for (size_t i = number_of_lms_strings - 1; i < number_of_lms_strings; --i)
731 {
732 if (!same_lms[i])
733 {
734 ++order;
735 }
736 if (left[i] / 2 >= size_of_part)
737 {
738 text_rec[(left[i] / 2) - size_of_part] = order;
739 }
740 }
741 std::string filename_text_rec_part2 = tmp_file(filename_sa, "_text_rec_part2" + util::to_string(recursion));
742 size_t pos = 0;
743 for (size_t i = 0; i < size_of_part; ++i)
744 {
745 if (text_rec[i] > 0)
746 {
747 text_rec[pos++] = text_rec[i];
748 }
749 }
750 text_rec.resize(pos);
751 store_to_file(text_rec, filename_text_rec_part2);
752 text_rec.resize(size_of_part);
753 util::_set_zero_bits(text_rec);
754 order = 0;
755 for (size_t i = number_of_lms_strings - 1; i < number_of_lms_strings; --i)
756 {
757 if (!same_lms[i])
758 {
759 ++order;
760 }
761 if (left[i] / 2 < size_of_part)
762 {
763 text_rec[left[i] / 2] = order;
764 }
765 }
766 pos = 0;
767 for (size_t i = 0; i < size_of_part; ++i)
768 {
769 if (text_rec[i] > 0)
770 {
771 text_rec[pos++] = text_rec[i];
772 }
773 }
774 text_rec.resize(number_of_lms_strings);
775 int_vector_buffer<> buf(filename_text_rec_part2, std::ios::in, 1024 * 1024);
776 for (size_t i = 0; i < buf.size(); ++i)
777 {
778 text_rec[pos++] = buf[i];
779 }
780 buf.close(true);
781 text_rec[number_of_lms_strings - 1] = 0;
782 }
783 else
784 {
785 text_rec.resize(n / 2 + 1);
786 util::_set_zero_bits(text_rec);
787 order = 0;
788 for (size_t i = number_of_lms_strings - 1; i < number_of_lms_strings; --i)
789 {
790 if (!same_lms[i])
791 {
792 ++order;
793 }
794 text_rec[left[left_pointer--] / 2] = order;
795 }
796 for (size_t i = 0, pos = 0; i < text_rec.size(); ++i)
797 {
798 if (text_rec[i] > 0)
799 {
800 text_rec[pos++] = text_rec[i];
801 }
802 }
803 text_rec[number_of_lms_strings - 1] = 0;
804 text_rec.resize(number_of_lms_strings);
805 }
806 }
807 util::clear(same_lms);
808 left.buffersize(0);
809 left.reset();
810
811 // Step 8 - Determine complete LMS order (recursivly)
812 int_vector<> isa_rec;
813 std::string filename_sa_rec = tmp_file(filename_sa, "_sa_rec" + util::to_string(recursion + 1));
814 if (text_rec.size() > order + 1)
815 {
816 if (recursion == 0)
817 {
818 memory_monitor::event("begin _construct_sa");
819 _construct_sa_se<int_vector<>>(text_rec, filename_sa_rec, order + 1, recursion + 1);
820 memory_monitor::event("end _construct_sa");
821 }
822 else
823 {
824 text_rec.resize(text_rec.size() * 2);
825 for (size_t i = 0; i < number_of_lms_strings; ++i)
826 {
827 text_rec[number_of_lms_strings + i] = text_rec[i];
828 text_rec[i] = 0;
829 }
830 memory_monitor::event("begin sa_simple");
831 _construct_sa_IS(text_rec,
832 text_rec,
833 filename_sa_rec,
834 number_of_lms_strings,
835 number_of_lms_strings,
836 order + 1,
837 recursion + 1);
838 memory_monitor::event("end sa_simple");
839 // SA' in first half, S' in second half
840 text_rec.resize(number_of_lms_strings);
841 store_to_file(text_rec, filename_sa_rec);
842 }
843 }
844 else
845 {
846 isa_rec = std::move(text_rec);
847 }
848
849 // Step 9 - Initiate left for scan in step 12
850 if (isa_rec.size() > 0)
851 {
852 // isa_rec exists //TODO test if its better to create sa_rec
853 // TODO always enough memory? in memory: isa_rec, lms_pos_b, select_support, tmp_left, leftbuffer
854 // load bit_vector lms_positions and create select support
855 bit_vector lms_pos_b(n);
856 load_from_file(lms_pos_b, filename_lms_pos_b);
857 sdsl::remove(filename_lms_pos_b);
858 select_support_mcl<> lms_select_support; // select_support for bit_vector
859 util::init_support(lms_select_support, &lms_pos_b); // Create select_support
860 // write left
861 int_vector<> tmp_left(number_of_lms_strings, 0, int_width);
862 for (size_t i = number_of_lms_strings - 1; i < number_of_lms_strings; --i)
863 {
864 size_t idx = isa_rec[i];
865 size_t val = lms_select_support.select(i + 1); // TODO test alternative without select support: look for 1
866 // in lms_pos_b (backwards)
867 tmp_left[idx] = val;
868 }
869 util::clear(lms_select_support);
870 util::clear(lms_pos_b);
871 util::clear(isa_rec);
872 // write to left
873 left.buffersize(buffersize);
874 left_pointer = 0;
875 for (; left_pointer < number_of_lms_strings; ++left_pointer)
876 {
877 left[left_pointer] = tmp_left[number_of_lms_strings - left_pointer - 1];
878 }
879 left_pointer--;
880 util::clear(tmp_left);
881 }
882 else
883 {
884 left.buffersize(buffersize);
885 left_pointer = 0;
886 {
887 // load bit_vector lms_positions and create select support
888 bit_vector lms_pos_b(n);
889 load_from_file(lms_pos_b, filename_lms_pos_b);
890 sdsl::remove(filename_lms_pos_b);
891 select_support_mcl<> lms_select_support; // select_support for bit_vector
892 util::init_support(lms_select_support, &lms_pos_b); // create select_support
893 // write to left sa_rec buffered
894 int_vector_buffer<> sa_rec_buf(filename_sa_rec, std::ios::in, buffersize, nsize);
895 for (uint64_t i = 0; i < sa_rec_buf.size(); ++i)
896 {
897 uint64_t pos = lms_select_support.select(sa_rec_buf[i] + 1);
898 left[number_of_lms_strings - 1 - left_pointer++] = pos;
899 }
900 sa_rec_buf.close(true);
901 left_pointer--;
902 }
903 // TODO test sa_rec unbuffered in recursion level 1 -> space still good?
904 }
905
906 // Step 10 - Reload text
907 load_from_file(text, filename_text);
908 sdsl::remove(filename_text);
909
910 // Step 12 - Scan virtual array from left to right second time
911 right.buffersize(buffersize);
912 right_pointer = 0;
913 int_vector_buffer<> cached_sa(filename_sa, std::ios::out, buffersize, nsize);
914 size_t sa_pointer = 0;
915 {
916 size_t partsize = bkt_l_sum / parts + 1;
917 int_vector<> array(partsize, 0, int_width);
918 std::vector<int_vector_buffer<>> cached_array(parts - 1);
919 for (size_t i = 0; i < cached_array.size(); ++i)
920 {
921 cached_array[i] = int_vector_buffer<>(
922 tmp_file(filename_sa, "_rightbuffer" + util::to_string(i) + "_" + util::to_string(recursion)),
923 std::ios::out,
924 buffersize,
925 nsize);
926 }
927
928 for (size_t c = 0, pos = 0, offset = 0; c < sigma; ++c)
929 {
930 // begin with array
931 for (; pos < bkt_l[c]; ++pos)
932 {
933 // Load lazy values
934 if (pos - offset >= partsize)
935 {
936 offset += partsize;
937 for (size_t i = 0, cur_part = pos / partsize - 1; i < cached_array[cur_part].size();)
938 {
939 size_t src = cached_array[cur_part][i++];
940 size_t val = cached_array[cur_part][i++];
941 array[src - offset] = val;
942 }
943 cached_array[pos / partsize - 1].reset();
944 }
945 size_t idx = array[pos - offset];
946 if (idx == 0)
947 {
948 cached_sa[sa_pointer++] = idx;
949 right[right_pointer++] = idx;
950 }
951 else
952 {
953 size_t symbol = text[idx - 1];
954 cached_sa[sa_pointer++] = idx;
955 if (symbol >= c)
956 {
957 size_t val = idx - 1;
958 size_t src = bkt_l[symbol];
959 bkt_l[symbol] = bkt_l[symbol] + 1;
960 if ((src - offset) / partsize == 0)
961 {
962 array[src - offset] = val;
963 }
964 else
965 {
966 size_t part = src / partsize - 1;
967 cached_array[part].push_back(src);
968 cached_array[part].push_back(val);
969 }
970 }
971 else
972 {
973 right[right_pointer++] = idx;
974 }
975 }
976 }
977 sa_pointer = C[c];
978 // continue with stack
979 while (left_pointer < number_of_lms_strings and text[left[left_pointer]] == c)
980 {
981 size_t idx = left[left_pointer--];
982 if (idx == 0)
983 {
984 idx = n;
985 }
986 --idx;
987 size_t symbol = text[idx];
988 size_t val = idx;
989 size_t src = bkt_l[symbol];
990 bkt_l[symbol] = bkt_l[symbol] + 1;
991 if ((src - offset) / partsize == 0)
992 {
993 array[src - offset] = val;
994 }
995 else
996 {
997 size_t part = src / partsize - 1;
998 cached_array[part].push_back(src);
999 cached_array[part].push_back(val);
1000 }
1001 }
1002 }
1003 for (size_t i = 0; i < cached_array.size(); ++i)
1004 {
1005 cached_array[i].close(true);
1006 }
1007 }
1008 left.close(true);
1009 right_pointer--;
1010
1011 // Step 13 - Scan virtual array from right to left second time
1012 {
1013 size_t partsize = bound_s / parts + 1;
1014
1015 int_vector<> array(partsize, 0, int_width);
1016 std::vector<int_vector_buffer<>> cached_array(parts - 1);
1017 for (size_t i = 0; i < cached_array.size(); ++i)
1018 {
1019 cached_array[i] = int_vector_buffer<>(
1020 tmp_file(filename_sa, "_leftbuffer" + util::to_string(i) + "_" + util::to_string(recursion)),
1021 std::ios::out,
1022 buffersize,
1023 nsize);
1024 // cached_array_pointer[i] = 0;
1025 }
1026 for (size_t c = sigma - 1, pos = bkt_s_last - 1, offset = partsize * (parts - 1); c < sigma; --c)
1027 {
1028 // begin with array
1029 assert(c < C.size());
1030 sa_pointer = C[c] - 1;
1031 for (; pos + 1 > bkt_s[c]; --pos)
1032 {
1033 while (pos < offset)
1034 {
1035 // Load buffered values
1036 offset -= partsize;
1037 for (size_t i = 0, cur_part = offset / partsize; i < cached_array[cur_part].size();)
1038 {
1039 size_t src = cached_array[cur_part][i++];
1040 size_t val = cached_array[cur_part][i++];
1041 assert((src - offset) < array.size());
1042 array[src - offset] = val;
1043 }
1044 assert((offset / partsize) < parts - 1);
1045 cached_array[offset / partsize].reset();
1046 }
1047
1048 assert((pos - offset) < array.size());
1049 size_t idx = array[pos - offset];
1050 if (idx == 0)
1051 {
1052 idx = n;
1053 }
1054 --idx;
1055 assert((idx) < text.size());
1056 size_t symbol = text[idx];
1057 if (symbol <= c)
1058 {
1059 if (idx == n - 1)
1060 {
1061 cached_sa[sa_pointer--] = 0;
1062 }
1063 else
1064 {
1065 cached_sa[sa_pointer--] = idx + 1;
1066 }
1067 assert((symbol) < bkt_s.size());
1068 bkt_s[symbol] = bkt_s[symbol] - 1;
1069 size_t val = idx;
1070 size_t src = bkt_s[symbol];
1071 if (src >= offset)
1072 {
1073 assert((src - offset) < array.size());
1074 array[src - offset] = val;
1075 }
1076 else
1077 {
1078 size_t part = src / partsize;
1079 assert(part < parts - 1);
1080 cached_array[part].push_back(src);
1081 cached_array[part].push_back(val);
1082 }
1083 }
1084 else
1085 {
1086 if (idx == n - 1)
1087 {
1088 cached_sa[sa_pointer--] = 0;
1089 }
1090 else
1091 {
1092 cached_sa[sa_pointer--] = idx + 1;
1093 }
1094 }
1095 }
1096 // continue with stack
1097 while (right_pointer < number_of_lms_strings and text[right[right_pointer]] == c)
1098 {
1099 size_t idx = right[right_pointer--];
1100 if (idx == 0)
1101 {
1102 idx = n;
1103 }
1104 --idx;
1105 size_t symbol = text[idx];
1106 assert((symbol) < bkt_s.size());
1107 bkt_s[symbol] = bkt_s[symbol] - 1;
1108
1109 size_t val = idx;
1110 size_t src = bkt_s[symbol];
1111 if (src >= offset)
1112 {
1113 assert((src - offset) < array.size());
1114 array[src - offset] = val;
1115 }
1116 else
1117 {
1118 size_t part = src / partsize;
1119 assert((part) < parts - 1);
1120 cached_array[part].push_back(src);
1121 cached_array[part].push_back(val);
1122 }
1123 }
1124 }
1125 for (size_t i = 0; i < cached_array.size(); ++i)
1126 {
1127 cached_array[i].close(true);
1128 }
1129 }
1130 right.close(true);
1131 cached_sa.close();
1132
1133 return;
1134}
1135
1136} // namespace sdsl
1137
1138#endif
bits.hpp contains the sdsl::bits class.
uint64_t size() const
Returns the number of elements currently stored.
void reset()
Delete all content and set size to 0.
void close(bool remove_file=false)
Close the int_vector_buffer.
uint64_t buffersize() const
Returns the buffersize in bytes.
void push_back(const uint64_t value)
Appends the given element value to the end of the int_vector_buffer.
A generic vector class for integers of width .
Definition io.hpp:36
size_type size() const noexcept
The number of elements in the int_vector.
void resize(const size_type size)
Resize the int_vector in terms of elements.
void push_back(value_type value)
Insert element at the end.
uint8_t width() const noexcept
Returns the width of the integers which are accessed via the [] operator.
static mm_event_proxy event(std::string const &name)
A class supporting constant time select queries.
size_type select(size_type i) const
Select function.
int_vector.hpp contains the sdsl::int_vector class.
int_vector_buffer.hpp contains the sdsl::int_vector_buffer class.
io.hpp contains some methods for reading/writing sdsl structures.
memory_tracking.hpp contains two function for allocating and deallocating memory
std::string to_string(T const &t, int w=1)
void _set_zero_bits(t_int_vec &v)
Sets all bits of the int_vector to 0-bits.
Definition util.hpp:552
Namespace for the succinct data structure library.
void _construct_sa_IS(int_vector<> &text, int_vector<> &sa, std::string &filename_sa, size_t n, size_t text_offset, size_t sigma, uint64_t recursion)
int remove(std::string const &)
Remove a file.
Definition ram_fs.hpp:221
std::string tmp_file(cache_config const &config, std::string name_part="")
Returns a name for a temporary file. I.e. the name was not used before.
Definition io.hpp:759
uint64_t _get_next_lms_position(int_vector_type &text, uint64_t i)
bool load_from_file(T &v, std::string const &file)
Load sdsl-object v from a file.
Definition io.hpp:989
void _construct_sa_se(int_vector_type &text, std::string filename_sa, uint64_t sigma, uint64_t recursion)
bool store_to_file(T const &v, std::string const &file)
Store a data structure to a file.
Definition io.hpp:874
ram_fs.hpp
select_support_mcl.hpp contains classes that support a sdsl::bit_vector with constant time select inf...
static constexpr uint32_t hi(uint64_t x)
Position of the most significant set bit the 64-bit word x.
Definition bits.hpp:653
util.hpp contains some helper methods for int_vector and other stuff like demangle class names.