|  | /* | 
|  | * Copyright (C) 2019 The Android Open Source Project | 
|  | * | 
|  | * Licensed under the Apache License, Version 2.0 (the "License"); | 
|  | * you may not use this file except in compliance with the License. | 
|  | * You may obtain a copy of the License at | 
|  | * | 
|  | *      http://www.apache.org/licenses/LICENSE-2.0 | 
|  | * | 
|  | * Unless required by applicable law or agreed to in writing, software | 
|  | * distributed under the License is distributed on an "AS IS" BASIS, | 
|  | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | 
|  | * See the License for the specific language governing permissions and | 
|  | * limitations under the License. | 
|  | */ | 
|  |  | 
|  | #include "src/trace_processor/containers/bit_vector.h" | 
|  |  | 
|  | #include <limits> | 
|  |  | 
|  | #include "protos/perfetto/trace_processor/serialization.pbzero.h" | 
|  | #include "src/trace_processor/containers/bit_vector_iterators.h" | 
|  |  | 
|  | #if PERFETTO_BUILDFLAG(PERFETTO_X64_CPU_OPT) | 
|  | #include <immintrin.h> | 
|  | #endif | 
|  |  | 
|  | namespace perfetto { | 
|  | namespace trace_processor { | 
|  | namespace { | 
|  |  | 
|  | // This function implements the PDEP instruction in x64 as a loop. | 
|  | // See https://www.felixcloutier.com/x86/pdep for details on what PDEP does. | 
|  | // | 
|  | // Unfortunately, as we're emulating this in software, it scales with the number | 
|  | // of set bits in |mask| rather than being a constant time instruction: | 
|  | // therefore, this should be avoided where real instructions are available. | 
|  | uint64_t PdepSlow(uint64_t word, uint64_t mask) { | 
|  | if (word == 0 || mask == std::numeric_limits<uint64_t>::max()) | 
|  | return word; | 
|  |  | 
|  | // This algorithm is for calculating PDEP was found to be the fastest "simple" | 
|  | // one among those tested when writing this function. | 
|  | uint64_t result = 0; | 
|  | for (uint64_t bb = 1; mask; bb += bb) { | 
|  | if (word & bb) { | 
|  | // MSVC doesn't like -mask so work around this by doing 0 - mask. | 
|  | result |= mask & (0ull - mask); | 
|  | } | 
|  | mask &= mask - 1; | 
|  | } | 
|  | return result; | 
|  | } | 
|  |  | 
|  | // See |PdepSlow| for information on PDEP. | 
|  | uint64_t Pdep(uint64_t word, uint64_t mask) { | 
|  | #if PERFETTO_BUILDFLAG(PERFETTO_X64_CPU_OPT) | 
|  | base::ignore_result(PdepSlow); | 
|  | return _pdep_u64(word, mask); | 
|  | #else | 
|  | return PdepSlow(word, mask); | 
|  | #endif | 
|  | } | 
|  |  | 
|  | }  // namespace | 
|  |  | 
|  | BitVector::BitVector() = default; | 
|  |  | 
|  | BitVector::BitVector(std::initializer_list<bool> init) { | 
|  | for (bool x : init) { | 
|  | if (x) { | 
|  | AppendTrue(); | 
|  | } else { | 
|  | AppendFalse(); | 
|  | } | 
|  | } | 
|  | } | 
|  |  | 
|  | BitVector::BitVector(uint32_t count, bool value) { | 
|  | Resize(count, value); | 
|  | } | 
|  |  | 
|  | BitVector::BitVector(std::vector<uint64_t> words, | 
|  | std::vector<uint32_t> counts, | 
|  | uint32_t size) | 
|  | : size_(size), counts_(std::move(counts)), words_(std::move(words)) { | 
|  | PERFETTO_CHECK(words_.size() % Block::kWords == 0); | 
|  | } | 
|  |  | 
|  | void BitVector::Resize(uint32_t new_size, bool filler) { | 
|  | uint32_t old_size = size_; | 
|  | if (new_size == old_size) | 
|  | return; | 
|  |  | 
|  | // Empty bitvectors should be memory efficient so we don't keep any data | 
|  | // around in the bitvector. | 
|  | if (new_size == 0) { | 
|  | words_.clear(); | 
|  | counts_.clear(); | 
|  | size_ = 0; | 
|  | return; | 
|  | } | 
|  |  | 
|  | // Compute the address of the new last bit in the bitvector. | 
|  | Address last_addr = IndexToAddress(new_size - 1); | 
|  | uint32_t old_blocks_size = static_cast<uint32_t>(counts_.size()); | 
|  | uint32_t new_blocks_size = last_addr.block_idx + 1; | 
|  |  | 
|  | // Resize the block and count vectors to have the correct number of entries. | 
|  | words_.resize(Block::kWords * new_blocks_size); | 
|  | counts_.resize(new_blocks_size); | 
|  |  | 
|  | if (new_size > old_size) { | 
|  | if (filler) { | 
|  | // If the new space should be filled with ones, then set all the bits | 
|  | // between the address of the old size and the new last address. | 
|  | const Address& start = IndexToAddress(old_size); | 
|  | Set(start, last_addr); | 
|  |  | 
|  | // We then need to update the counts vector to match the changes we | 
|  | // made to the blocks. | 
|  |  | 
|  | // We start by adding the bits we set in the first block to the | 
|  | // cummulative count before the range we changed. | 
|  | Address end_of_block = {start.block_idx, | 
|  | {Block::kWords - 1, BitWord::kBits - 1}}; | 
|  | uint32_t count_in_block_after_end = | 
|  | AddressToIndex(end_of_block) - AddressToIndex(start) + 1; | 
|  | uint32_t set_count = CountSetBits() + count_in_block_after_end; | 
|  |  | 
|  | for (uint32_t i = start.block_idx + 1; i <= last_addr.block_idx; ++i) { | 
|  | // Set the count to the cummulative count so far. | 
|  | counts_[i] = set_count; | 
|  |  | 
|  | // Add a full block of set bits to the count. | 
|  | set_count += Block::kBits; | 
|  | } | 
|  | } else { | 
|  | // If the newly added bits are false, we just need to update the | 
|  | // counts vector with the current size of the bitvector for all | 
|  | // the newly added blocks. | 
|  | if (new_blocks_size > old_blocks_size) { | 
|  | uint32_t count = CountSetBits(); | 
|  | for (uint32_t i = old_blocks_size; i < new_blocks_size; ++i) { | 
|  | counts_[i] = count; | 
|  | } | 
|  | } | 
|  | } | 
|  | } else { | 
|  | // Throw away all the bits after the new last bit. We do this to make | 
|  | // future lookup, append and resize operations not have to worrying about | 
|  | // trailing garbage bits in the last block. | 
|  | BlockFromIndex(last_addr.block_idx).ClearAfter(last_addr.block_offset); | 
|  | } | 
|  |  | 
|  | // Actually update the size. | 
|  | size_ = new_size; | 
|  | } | 
|  |  | 
|  | BitVector BitVector::Copy() const { | 
|  | return BitVector(words_, counts_, size_); | 
|  | } | 
|  |  | 
|  | BitVector::AllBitsIterator BitVector::IterateAllBits() const { | 
|  | return AllBitsIterator(this); | 
|  | } | 
|  |  | 
|  | BitVector::SetBitsIterator BitVector::IterateSetBits() const { | 
|  | return SetBitsIterator(this); | 
|  | } | 
|  |  | 
|  | void BitVector::Not() { | 
|  | for (uint32_t i = 0; i < words_.size(); ++i) { | 
|  | BitWord(&words_[i]).Not(); | 
|  | } | 
|  |  | 
|  | for (uint32_t i = 1; i < counts_.size(); ++i) { | 
|  | counts_[i] = kBitsInBlock * i - counts_[i]; | 
|  | } | 
|  | } | 
|  |  | 
|  | void BitVector::Or(const BitVector& sec) { | 
|  | PERFETTO_CHECK(size_ == sec.size()); | 
|  | for (uint32_t i = 0; i < words_.size(); ++i) { | 
|  | BitWord(&words_[i]).Or(sec.words_[i]); | 
|  | } | 
|  |  | 
|  | for (uint32_t i = 1; i < counts_.size(); ++i) { | 
|  | counts_[i] = counts_[i - 1] + | 
|  | ConstBlock(&words_[Block::kWords * (i - 1)]).CountSetBits(); | 
|  | } | 
|  | } | 
|  |  | 
|  | void BitVector::And(const BitVector& sec) { | 
|  | Resize(std::min(size_, sec.size_)); | 
|  | for (uint32_t i = 0; i < words_.size(); ++i) { | 
|  | BitWord(&words_[i]).And(sec.words_[i]); | 
|  | } | 
|  |  | 
|  | for (uint32_t i = 1; i < counts_.size(); ++i) { | 
|  | counts_[i] = counts_[i - 1] + | 
|  | ConstBlock(&words_[Block::kWords * (i - 1)]).CountSetBits(); | 
|  | } | 
|  | } | 
|  |  | 
|  | void BitVector::UpdateSetBits(const BitVector& update) { | 
|  | if (update.CountSetBits() == 0 || CountSetBits() == 0) { | 
|  | *this = BitVector(); | 
|  | return; | 
|  | } | 
|  | PERFETTO_DCHECK(update.size() <= CountSetBits()); | 
|  |  | 
|  | // Get the start and end ptrs for the current bitvector. | 
|  | // Safe because of the static_assert above. | 
|  | uint64_t* ptr = words_.data(); | 
|  | const uint64_t* ptr_end = ptr + WordCount(size()); | 
|  |  | 
|  | // Get the start and end ptrs for the update bitvector. | 
|  | // Safe because of the static_assert above. | 
|  | const uint64_t* update_ptr = update.words_.data(); | 
|  | const uint64_t* update_ptr_end = update_ptr + WordCount(update.size()); | 
|  |  | 
|  | // |update_unused_bits| contains |unused_bits_count| bits at the bottom | 
|  | // which indicates how the next |unused_bits_count| set bits in |this| | 
|  | // should be changed. This is necessary because word boundaries in |this| will | 
|  | // almost always *not* match the word boundaries in |update|. | 
|  | uint64_t update_unused_bits = 0; | 
|  | uint8_t unused_bits_count = 0; | 
|  |  | 
|  | // The basic premise of this loop is, for each word in |this| we find | 
|  | // enough bits from |update| to cover every set bit in the word. We then use | 
|  | // the PDEP x64 instruction (or equivalent instructions/software emulation) to | 
|  | // update the word and store it back in |this|. | 
|  | for (; ptr != ptr_end; ++ptr) { | 
|  | uint64_t current = *ptr; | 
|  |  | 
|  | // If the current value is all zeros, there's nothing to update. | 
|  | if (PERFETTO_UNLIKELY(current == 0)) | 
|  | continue; | 
|  |  | 
|  | uint8_t popcount = static_cast<uint8_t>(PERFETTO_POPCOUNT(current)); | 
|  | PERFETTO_DCHECK(popcount >= 1); | 
|  |  | 
|  | // Check if we have enough unused bits from the previous iteration - if so, | 
|  | // we don't need to read anything from |update|. | 
|  | uint64_t update_for_current = update_unused_bits; | 
|  | if (unused_bits_count >= popcount) { | 
|  | // We have enough bits so just do the accounting to not reuse these bits | 
|  | // for the future. | 
|  | unused_bits_count -= popcount; | 
|  | update_unused_bits = popcount == 64 ? 0 : update_unused_bits >> popcount; | 
|  | } else { | 
|  | // We don't have enough bits so we need to read the next word of bits from | 
|  | // |current|. | 
|  | uint64_t next_update = update_ptr == update_ptr_end ? 0 : *update_ptr++; | 
|  |  | 
|  | // Bitwise or |64 - unused_bits_count| bits from the bottom of | 
|  | // |next_update| to the top of |update_for_current|. Only |popcount| bits | 
|  | // will actually be used by PDEP but masking off the unused bits takes | 
|  | // *more* instructions than not doing anything. | 
|  | update_for_current |= next_update << unused_bits_count; | 
|  |  | 
|  | // PDEP will use |popcount| bits from update: this means it will use | 
|  | // |unused_bits_count| from |update_for_current| and |popcount - | 
|  | // unused_bits_count| from |next_update| | 
|  | uint8_t used_next_bits = popcount - unused_bits_count; | 
|  |  | 
|  | // Shift off any bits which will be used by current and store the | 
|  | // remainder for use in the next iteration. | 
|  | update_unused_bits = | 
|  | used_next_bits == 64 ? 0 : next_update >> used_next_bits; | 
|  | unused_bits_count = 64 - used_next_bits; | 
|  | } | 
|  |  | 
|  | // We should never end up with more than 64 bits available. | 
|  | PERFETTO_CHECK(unused_bits_count <= 64); | 
|  |  | 
|  | // PDEP precisely captures the notion of "updating set bits" for a single | 
|  | // word. | 
|  | *ptr = Pdep(update_for_current, current); | 
|  | } | 
|  |  | 
|  | // We shouldn't have any non-zero unused bits and we should have consumed the | 
|  | // whole |update| bitvector. Note that we cannot really say anything about | 
|  | // |unused_bits_count| because it's possible for the above algorithm to use | 
|  | // some bits which are "past the end" of |update|; as long as these bits are | 
|  | // zero, it meets the pre-condition of this function. | 
|  | PERFETTO_DCHECK(update_unused_bits == 0); | 
|  | PERFETTO_DCHECK(update_ptr == update_ptr_end); | 
|  |  | 
|  | for (uint32_t i = 0; i < counts_.size() - 1; ++i) { | 
|  | counts_[i + 1] = counts_[i] + ConstBlockFromIndex(i).CountSetBits(); | 
|  | } | 
|  |  | 
|  | // After the loop, we should have precisely the same number of bits | 
|  | // set as |update|. | 
|  | PERFETTO_DCHECK(update.CountSetBits() == CountSetBits()); | 
|  | } | 
|  |  | 
|  | BitVector BitVector::IntersectRange(uint32_t range_start, | 
|  | uint32_t range_end) const { | 
|  | // We should skip all bits until the index of first set bit bigger than | 
|  | // |range_start|. | 
|  | uint32_t end_idx = std::min(range_end, size()); | 
|  |  | 
|  | if (range_start >= end_idx) | 
|  | return BitVector(); | 
|  |  | 
|  | Builder builder(end_idx, range_start); | 
|  | uint32_t front_bits = builder.BitsUntilWordBoundaryOrFull(); | 
|  | uint32_t cur_index = range_start; | 
|  | for (uint32_t i = 0; i < front_bits; ++i, ++cur_index) { | 
|  | builder.Append(IsSet(cur_index)); | 
|  | } | 
|  |  | 
|  | PERFETTO_DCHECK(cur_index == end_idx || cur_index % BitWord::kBits == 0); | 
|  | uint32_t cur_words = cur_index / BitWord::kBits; | 
|  | uint32_t full_words = builder.BitsInCompleteWordsUntilFull() / BitWord::kBits; | 
|  | uint32_t total_full_words = cur_words + full_words; | 
|  | for (; cur_words < total_full_words; ++cur_words) { | 
|  | builder.AppendWord(words_[cur_words]); | 
|  | } | 
|  |  | 
|  | uint32_t last_bits = builder.BitsUntilFull(); | 
|  | cur_index += full_words * BitWord::kBits; | 
|  | for (uint32_t i = 0; i < last_bits; ++i, ++cur_index) { | 
|  | builder.Append(IsSet(cur_index)); | 
|  | } | 
|  |  | 
|  | return std::move(builder).Build(); | 
|  | } | 
|  |  | 
|  | void BitVector::Serialize( | 
|  | protos::pbzero::SerializedColumn::BitVector* msg) const { | 
|  | msg->set_size(size_); | 
|  | if (!counts_.empty()) { | 
|  | msg->set_counts(reinterpret_cast<const uint8_t*>(counts_.data()), | 
|  | sizeof(uint32_t) * counts_.size()); | 
|  | } | 
|  | if (!words_.empty()) { | 
|  | msg->set_words(reinterpret_cast<const uint8_t*>(words_.data()), | 
|  | sizeof(uint64_t) * words_.size()); | 
|  | } | 
|  | } | 
|  |  | 
|  | // Deserialize BitVector from proto. | 
|  | void BitVector::Deserialize( | 
|  | const protos::pbzero::SerializedColumn::BitVector::Decoder& bv_msg) { | 
|  | size_ = bv_msg.size(); | 
|  | if (bv_msg.has_counts()) { | 
|  | counts_.resize( | 
|  | static_cast<size_t>(bv_msg.counts().size / sizeof(uint32_t))); | 
|  | memcpy(counts_.data(), bv_msg.counts().data, bv_msg.counts().size); | 
|  | } else { | 
|  | counts_.clear(); | 
|  | } | 
|  |  | 
|  | if (bv_msg.has_words()) { | 
|  | words_.resize(static_cast<size_t>(bv_msg.words().size / sizeof(uint64_t))); | 
|  | memcpy(words_.data(), bv_msg.words().data, bv_msg.words().size); | 
|  | } else { | 
|  | words_.clear(); | 
|  | } | 
|  | } | 
|  |  | 
|  | }  // namespace trace_processor | 
|  | }  // namespace perfetto |