// Get the list of files to search in this level FileMetaData*const* files =&files_[level][0]; if (level ==0) { //level0Ҏ(gu)处理Q因为key是重叠,所有符合条件的文g必须被查?/span> tmp.reserve(num_files); for (uint32_t i =0; i < num_files; i++) { FileMetaData* f = files[i]; if (ucmp->Compare(user_key, f->smallest.user_key()) >=0&& ucmp->Compare(user_key, f->largest.user_key()) <=0) { tmp.push_back(f); } } if (tmp.empty()) continue;
std::sort(tmp.begin(), tmp.end(), NewestFirst); files =&tmp[0]; num_files = tmp.size(); } else { // 二分法查找,某个key只可能属于一个文?/span> uint32_t index = FindFile(vset_->icmp_, files_[level], ikey); //没有查到 if (index >= num_files) { files = NULL; num_files =0; } else { tmp2 = files[index]; if (ucmp->Compare(user_key, tmp2->smallest.user_key()) <0) { // All of "tmp2" is past any data for user_key files = NULL; num_files =0; } else { files =&tmp2; num_files =1; } } }
for (uint32_t i =0; i < num_files; ++i) { //遍历本层W合条g的文?/span> if (last_file_read != NULL && stats->seek_file == NULL) { //seek_file只记录第一?/span> stats->seek_file = last_file_read; stats->seek_file_level = last_file_read_level; }
FileMetaData* f = files[i]; last_file_read = f; last_file_read_level = level;
//从LRU cache中查?/span> Cache::Handle* handle = cache_->Lookup(key); if (handle == NULL) { /加蝲文g std::string fname = TableFileName(dbname_, file_number); RandomAccessFile* file = NULL; Table* table = NULL; Status s = env_->NewRandomAccessFile(fname, &file); if (s.ok()) { s = Table::Open(*options_, file, file_size, &table); }
if (!s.ok()) { assert(table == NULL); delete file; // We do not cache error results so that if the error is transient, // or somebody repairs the file, we recover automatically. return NewErrorIterator(s); }
// Apply a batch of updates atomically: append to the write-ahead log,
// then insert into the active memtable.  Returns the status of the first
// failing step.  The mutex is released while doing log/memtable I/O; the
// "logger_" token (acquired below) is what excludes concurrent writers
// during that window.
Status DBImpl::Write(const WriteOptions& options, WriteBatch* updates) {
  Status status;
  // Serialize writers and protect versions_/state under the db mutex.
  MutexLock l(&mutex_);
  LoggerId self;
  // Acquire exclusive permission to write to the log.
  AcquireLoggingResponsibility(&self);
  // Ensure there is room to write (may switch memtables / trigger compaction).
  status = MakeRoomForWrite(false);  // May temporarily release lock and wait
  uint64_t last_sequence = versions_->LastSequence();
  if (status.ok()) {
    // Assign this batch a contiguous range of sequence numbers starting
    // right after the current last sequence.
    WriteBatchInternal::SetSequence(updates, last_sequence + 1);
    last_sequence += WriteBatchInternal::Count(updates);

    // Add to log and apply to memtable.  We can release the lock during
    // this phase since the "logger_" flag protects against concurrent
    // loggers and concurrent writes into mem_.
    {
      assert(logger_ == &self);
      mutex_.Unlock();
      // I/O: append the serialized batch to the log file.
      status = log_->AddRecord(WriteBatchInternal::Contents(updates));
      if (status.ok() && options.sync) {
        // Durability requested: fsync the log before acknowledging.
        status = logfile_->Sync();
      }
      // Only insert into the memtable once the log write succeeded.
      if (status.ok()) {
        status = WriteBatchInternal::InsertInto(updates, mem_);
      }
      mutex_.Lock();
      assert(logger_ == &self);
    }
    // Publish the new last sequence number (back under the mutex).
    versions_->SetLastSequence(last_sequence);
  }
  // Hand the logging token to the next waiting writer.
  ReleaseLoggingResponsibility(&self);
  return status;
}
写流量控制：<db/dbimpl.cc>
Status DBImpl::MakeRoomForWrite(bool force) { mutex_.AssertHeld(); assert(logger_ != NULL); bool allow_delay =!force; Status s; while (true) { if (!bg_error_.ok()) { // Yield previous error s = bg_error_; break; } elseif ( allow_delay && versions_->NumLevelFiles(0) >= config::kL0_SlowdownWritesTrigger) { mutex_.Unlock(); //如果level0的文件大于kL0_SlowdownWritesTrigger阈|则sleep 1sQ这L(fng)compaction更多的CPU env_->SleepForMicroseconds(1000); allow_delay =false; // Do not delay a single write more than once mutex_.Lock(); } elseif (!force && (mem_->ApproximateMemoryUsage() <= options_.write_buffer_size)) { //可写 break; } elseif (imm_ != NULL) { // imm_:之前的memtable 没有被compactionQ需要等?/span> bg_cv_.Wait(); } elseif (versions_->NumLevelFiles(0) >= config::kL0_StopWritesTrigger) { // level0文g个数大于kL0_StopWritesTrigger,需要等?/span> Log(options_.info_log, "waiting\n"); bg_cv_.Wait(); } else { //生成新的额memtable和logfileQ把当前memtable传给imm_ assert(versions_->PrevLogNumber() ==0); uint64_t new_log_number = versions_->NewFileNumber(); WritableFile* lfile = NULL; s = env_->NewWritableFile(LogFileName(dbname_, new_log_number), &lfile); if (!s.ok()) { break; } delete log_; delete logfile_; logfile_ = lfile; logfile_number_ = new_log_number; log_ =new log::Writer(lfile); imm_ = mem_; has_imm_.Release_Store(imm_); mem_ =new MemTable(internal_comparator_); mem_->Ref(); force =false; // Do not force another compaction if have room
// No copying allowed MemTable(const MemTable&); voidoperator=(const MemTable&); };
先看看插入<br /><db/memtable.cc>
// Append one entry for (key, sequence, type, value) to the skiplist.
//
// The entry is encoded into a single arena-allocated buffer:
//   varint32            internal_key_size  (= key.size() + 8)
//   char[key.size()]    user key bytes
//   fixed64             (sequence << 8) | type   -- the "tag"
//   varint32            value_size
//   char[value_size]    value bytes
void MemTable::Add(SequenceNumber s, ValueType type,
                   const Slice& key,
                   const Slice& value) {
  const size_t user_key_len = key.size();
  const size_t value_len = value.size();
  // Internal key = user key followed by the 8-byte packed tag.
  const size_t internal_key_len = user_key_len + 8;
  const size_t total_len = VarintLength(internal_key_len) + internal_key_len +
                           VarintLength(value_len) + value_len;
  char* const entry = arena_.Allocate(total_len);

  // Write each field in sequence, advancing the cursor as we go.
  char* cursor = EncodeVarint32(entry, internal_key_len);
  memcpy(cursor, key.data(), user_key_len);
  cursor += user_key_len;
  EncodeFixed64(cursor, (s << 8) | type);
  cursor += 8;
  cursor = EncodeVarint32(cursor, value_len);
  memcpy(cursor, value.data(), value_len);
  assert((cursor + value_len) - entry == total_len);

  table_.Insert(entry);
}
查询 <db/memtable.cc>
// Look up `key` in the memtable.
// Returns true when the memtable has an authoritative answer:
//   - a live value  -> copied into *value, returns true
//   - a deletion tombstone -> *s set to NotFound, returns true
// Returns false when the key is absent here, in which case the caller
// must keep searching older data (immutable memtable, then sstables).
bool MemTable::Get(const LookupKey& key, std::string* value, Status* s) {
  Slice memkey = key.memtable_key();
  Table::Iterator iter(&table_);
  // Seek to the first entry >= memkey; because memkey embeds the lookup
  // sequence number, entries newer than the snapshot sort before it and
  // are skipped.
  iter.Seek(memkey.data());
  if (iter.Valid()) {
    // entry format is:
    //    klength  varint32
    //    userkey  char[klength]
    //    tag      uint64
    //    vlength  varint32
    //    value    char[vlength]
    // Check that it belongs to same user key.  We do not check the
    // sequence number since the Seek() call above should have skipped
    // all entries with overly large sequence numbers.
    const char* entry = iter.key();
    uint32_t key_length;
    // entry + 5: a varint32 occupies at most 5 bytes.
    const char* key_ptr = GetVarint32Ptr(entry, entry + 5, &key_length);
    if (comparator_.comparator.user_comparator()->Compare(
        // key_length - 8 strips the trailing 8-byte tag, leaving the user key.
        Slice(key_ptr, key_length - 8),
        key.user_key()) == 0) {
      // Correct user key.  Decode the tag: (sequence << 8) | value type.
      const uint64_t tag = DecodeFixed64(key_ptr + key_length - 8);
      switch (static_cast<ValueType>(tag & 0xff)) {
        case kTypeValue: {
          Slice v = GetLengthPrefixedSlice(key_ptr + key_length);
          value->assign(v.data(), v.size());
          return true;
        }
        case kTypeDeletion:
          // Tombstone: the key was deleted; report NotFound but still
          // return true so callers stop searching older levels.
          *s = Status::NotFound(Slice());
          return true;
      }
    }
  }
  return false;
}
]]>leveldb研究7-Version/VersionSet/VersionEdithttp://m.tkk7.com/sandy/archive/2012/03/16/leveldb7.html明明Fri, 16 Mar 2012 09:10:00 GMThttp://m.tkk7.com/sandy/archive/2012/03/16/leveldb7.htmlhttp://m.tkk7.com/sandy/comments/372028.htmlhttp://m.tkk7.com/sandy/archive/2012/03/16/leveldb7.html#Feedback0http://m.tkk7.com/sandy/comments/commentRss/372028.htmlhttp://m.tkk7.com/sandy/services/trackbacks/372028.html
先看看一个重要的数据结构：sst file的META info
<db/version_edit.h>
f->allowed_seeks = (f->file_size /16384); if (f->allowed_seeks <100) f->allowed_seeks =100;
原因Q请看leveldb的注释:(x)
// We arrange to automatically compact this file after a certain number of seeks. Let's assume:
// (1) One seek costs 10ms
// (2) Writing or reading 1MB costs 10ms (100MB/s)
// (3) A compaction of 1MB does 25MB of IO:
// 1MB read from this level
// 10-12MB read from next level (boundaries may be misaligned)
// 10-12MB written to next level
// This implies that 25 seeks cost the same as the compaction
// of 1MB of data. I.e., one seek costs approximately the
// same as the compaction of 40KB of data. We are a little
// conservative and allow approximately one seek for every 16KB
// of data before triggering a compaction.
接下来看Version的定义，version其实是一系列的SST file的集合。<br />
class Version { public: //生成iterator用于遍历 void AddIterators(const ReadOptions&, std::vector<Iterator*>* iters);
//Ҏ(gu)key来查询,若没有查刎ͼ更新GetStats struct GetStats {
FileMetaData* seek_file; int seek_file_level;
};
Status Get(const ReadOptions&, const LookupKey& key, std::string* val,
GetStats* stats);
//查询和key range有关的files void GetOverlappingInputs( int level, const InternalKey* begin, // NULL means before all keys const InternalKey* end, // NULL means after all keys std::vector<FileMetaData*>* inputs);
//memtable output应该攑ֈ哪个level int PickLevelForMemTableOutput(const Slice& smallest_user_key, const Slice& largest_user_key);
//某个level的文件个?/span> int NumFiles(int level) const { return files_[level].size(); }
// Return a human readable string that describes this version's contents. std::string DebugString() const;
private:
friend class Compaction;
friend class VersionSet;
class LevelFileNumIterator;
Iterator* NewConcatenatingIterator(const ReadOptions&, int level) const;
VersionSet* vset_; // VersionSet to which this Version belongs Version* next_; // Next version in linked list Version* prev_; // Previous version in linked list int refs_; // Number of live refs to this version
WritableFile* descriptor_file_; log::Writer* descriptor_log_; Version dummy_versions_; // Head of circular doubly-linked list of versions. Version* current_; // == dummy_versions_.prev_
// We leave eight bits empty at the bottom so a type and sequence# // can be packed together into 64-bits. static const SequenceNumber kMaxSequenceNumber = ((0x1ull << 56) - 1);
SnapShot(db/snapshot.h)，可以看出snapshot其实是一个sequence number
class SnapshotImpl : public Snapshot { public: //创徏后保持不?/span> SequenceNumber number_;
]]>Learn From HBase/Bigtablehttp://m.tkk7.com/sandy/archive/2012/03/07/Learn_From_HBase.html明明Wed, 07 Mar 2012 02:42:00 GMThttp://m.tkk7.com/sandy/archive/2012/03/07/Learn_From_HBase.htmlhttp://m.tkk7.com/sandy/comments/371379.htmlhttp://m.tkk7.com/sandy/archive/2012/03/07/Learn_From_HBase.html#Feedback0http://m.tkk7.com/sandy/comments/commentRss/371379.htmlhttp://m.tkk7.com/sandy/services/trackbacks/371379.html
1. 使用可信赖的分布式组件来搭建自己的分布式系统。</strong> 设计一个可靠、健壮的分布式系统是比较困难的。我们知道，为了防止SPOF(Single Point Of Failure)问题，我们要分散风险，把数据放在多个nodes上面去，但是这样带来的是数据的同步问题和版本问题，解决这个问题需要运用复杂的 Paxos协议，系统的复杂度自然就升高了。另外一个需要解决的问题是分布式锁和事件通知机制，以及全局信息共享，设计这些都需要大量的精力和仔细的设计。