深入浅出Zookeeper之一Server启动

wbj0110

浏览: 1639598 次
性别:
来自: 上海

最近访客更多访客>>

一往无前bhz

ninja2006

loginboot

u012363178

博主相关

博客

微博

相册

留言

关于我

文章分类

社区版块

存档分类

博客分类：

Zookeeper

ZooKeeper

大名鼎鼎的Zookeeper是解决分布式问题的神器。小编最近简单阅读了代码，分享一下。有不对之处，还请大家指出。

整篇文章将分多个系列完成，因为涉及点比较多，很难在一篇文章内搞定。关于zookeeper的使用场景，大家参考 http://rdc.taobao.com/team/jm/archives/1232 。api使用参考官网手册 http://zookeeper.apache.org/doc/trunk/ 。这里以最新的zookeeper3.4.5为例。

这个系列的第一篇来说说zookeeper server端的启动，以单机为例，分布式zookeeper将在后续专门分析。

单机版启动类ZooKeeperServerMain

Java代码  
protected void initializeAndRun(String[] args)  
        throws ConfigException, IOException  
    {  
        try {  
            ManagedUtil.registerLog4jMBeans();  
        } catch (JMException e) {  
            LOG.warn("Unable to register log4j JMX control", e);  
        }  
    //解析配置文件zoo.cfg  
        ServerConfig config = new ServerConfig();  
        if (args.length == 1) {  
            config.parse(args[0]);  
        } else {  
            config.parse(args);  
        }  
    //启动  
        runFromConfig(config);  
    }  

具体解析：

Java代码  
public void parse(String path) throws ConfigException {  
      QuorumPeerConfig config = new QuorumPeerConfig();  
      config.parse(path);  
  
      // let qpconfig parse the file and then pull the stuff we are  
      // interested in  
      readFrom(config);  
  }  

启动

Java代码  
  public void runFromConfig(ServerConfig config) throws IOException {  
      LOG.info("Starting server");  
      try {  
          // Note that this thread isn't going to be doing anything else,  
          // so rather than spawning another thread, we will just call  
          // run() in this thread.  
          // create a file logger url from the command line args  
          ZooKeeperServer zkServer = new ZooKeeperServer();  
  
//2个文件，log和data文件  
          FileTxnSnapLog ftxn = new FileTxnSnapLog(new  
                 File(config.dataLogDir), new File(config.dataDir));  
          zkServer.setTxnLogFactory(ftxn);  
          zkServer.setTickTime(config.tickTime);  
          zkServer.setMinSessionTimeout(config.minSessionTimeout);  
          zkServer.setMaxSessionTimeout(config.maxSessionTimeout);  
//连接工厂，默认NIOServerCnxnFactory  
          cnxnFactory = ServerCnxnFactory.createFactory();  
//初始化主线程，打开selector，并bind端口，打开NIO的ACCEPT通知  
          cnxnFactory.configure(config.getClientPortAddress(),  
                  config.getMaxClientCnxns());  
//并生成最新的snapshot文件，启动IO主线程，从snapshot文件和log文件中恢复内存database结构和session结构  
          cnxnFactory.startup(zkServer);  
//启动线程等待之前启动的主线程结束  
          cnxnFactory.join();  
          if (zkServer.isRunning()) {  
              zkServer.shutdown();  
          }  
      } catch (InterruptedException e) {  
          // warn, but generally this is ok  
          LOG.warn("Server interrupted", e);  
      }  
  }  

具体startup流程：

Java代码  
   /**
    * Brings the connection factory and the given server up, in this order:
    * start this factory, restore the server's data, start the server, then
    * attach the server to this factory.
    *
    * @param zks the server instance to restore and start
    * @throws IOException declared by the data-restore / startup calls
    * @throws InterruptedException declared by the data-restore / startup calls
    */
   public void startup(ZooKeeperServer zks) throws IOException,  
           InterruptedException {  
       // Start the IO thread (per the article, this is the factory's main thread).
       start();  
       // Restore the database and sessions from log and snapshot; the article
       // notes a fresh snapshot file is written as part of this step.
       zks.startdata();  
       // Per the article: starts the sessionTracker thread, builds the request
       // processor chain and starts each processor thread.
       zks.startup();  
       setZooKeeperServer(zks);  
   }  

具体恢复过程：

Java代码  
  public void startdata()   
  throws IOException, InterruptedException {  
      //check to see if zkDb is not null  
      if (zkDb == null) {  
//初始化database  
          zkDb = new ZKDatabase(this.txnLogFactory);  
      }    
      if (!zkDb.isInitialized()) {  
          loadData();  
      }  
  }  

DataTree用Map实现，key是节点名称，value是DataNode；DataNode中有parent指向父节点，有children指向所有子节点

Java代码  
   /**
    * Creates a tree that contains only the built-in nodes: the root plus the
    * proc and quota nodes registered below.
    */
   public DataTree() {  
       /* Rather than fight it, let root have an alias */  
       // The root node is registered under two keys: "" and rootZookeeper.
       // The article describes this constructor as initialising the three
       // system nodes "/", "/zookeeper" and "/zookeeper/quota"; the constant
       // values themselves are not visible in this excerpt.
       nodes.put("", root);  
       nodes.put(rootZookeeper, root);  
  
       /** add the proc node and quota node */  
       // Each node is linked as a child of its parent and also indexed in the map.
       root.addChild(procChildZookeeper);  
       nodes.put(procZookeeper, procDataNode);  
  
       procDataNode.addChild(quotaChildZookeeper);  
       nodes.put(quotaZookeeper, quotaDataNode);  
   }  

具体恢复数据

Java代码  
   public void loadData() throws IOException, InterruptedException {  
//执行恢复，并返回最新的事务ID  
       setZxid(zkDb.loadDataBase());  
       // Clean up dead sessions  
//清理session  
       LinkedList<Long> deadSessions = new LinkedList<Long>();  
       for (Long session : zkDb.getSessions()) {  
           if (zkDb.getSessionWithTimeOuts().get(session) == null) {  
               deadSessions.add(session);  
           }  
       }  
       zkDb.setDataTreeInit(true);  
       for (long session : deadSessions) {  
           // XXX: Is lastProcessedZxid really the best thing to use?  
           killSession(session, zkDb.getDataTreeLastProcessedZxid());  
       }  
//生成最新的snapshot文件  
       // Make a clean snapshot  
       takeSnapshot();  
   }  

load过程：

Java代码  
public long loadDataBase() throws IOException {  
    //load过程中，发起分布式提议，对于单机版，先不考虑  
    PlayBackListener listener=new PlayBackListener(){  
        public void onTxnLoaded(TxnHeader hdr,Record txn){  
            Request r = new Request(null, 0, hdr.getCxid(),hdr.getType(),  
                    null, null);  
            r.txn = txn;  
            r.hdr = hdr;  
            r.zxid = hdr.getZxid();  
            addCommittedProposal(r);  
        }  
    };  
    //load数据  
    long zxid = snapLog.restore(dataTree,sessionsWithTimeouts,listener);  
    initialized = true;  
     

restore过程：

Java代码  
   public long restore(DataTree dt, Map<Long, Integer> sessions,   
           PlayBackListener listener) throws IOException {  
//从FileSnap中恢复  
       snapLog.deserialize(dt, sessions);  
       FileTxnLog txnLog = new FileTxnLog(dataDir);  
       TxnIterator itr = txnLog.read(dt.lastProcessedZxid+1);  
       long highestZxid = dt.lastProcessedZxid;  
       TxnHeader hdr;  
//从snapshot中记录的最新的事务开始处理，将log中的事务merge到datatree中  
       while (true) {  
           // iterator points to   
           // the first valid txn when initialized  
           hdr = itr.getHeader();  
           if (hdr == null) {  
               //empty logs   
               return dt.lastProcessedZxid;  
           }  
           if (hdr.getZxid() < highestZxid && highestZxid != 0) {  
               LOG.error(highestZxid + "(higestZxid) > "  
                       + hdr.getZxid() + "(next log) for type "  
                       + hdr.getType());  
           } else {  
               highestZxid = hdr.getZxid();  
           }  
           try {  
               processTransaction(hdr,dt,sessions, itr.getTxn());  
           } catch(KeeperException.NoNodeException e) {  
              throw new IOException("Failed to process transaction type: " +  
                    hdr.getType() + " error: " + e.getMessage(), e);  
           }  
           listener.onTxnLoaded(hdr, itr.getTxn());  
           if (!itr.next())   
               break;  
       }  
       return highestZxid;  
   }  

FileSnap恢复过程：

Java代码  
   public long deserialize(DataTree dt, Map<Long, Integer> sessions)  
           throws IOException {  
       // we run through 100 snapshots (not all of them)  
       // if we cannot get it running within 100 snapshots  
       // we should  give up  
//找前100个snapshot文件，降序，最新的文件在最前面  
       List<File> snapList = findNValidSnapshots(100);  
       if (snapList.size() == 0) {  
           return -1L;  
       }  
//从最新的文件开始恢复，如果反序列化ok而且checksum也ok，则恢复结束  
       File snap = null;  
       boolean foundValid = false;  
       for (int i = 0; i < snapList.size(); i++) {  
           snap = snapList.get(i);  
           InputStream snapIS = null;  
           CheckedInputStream crcIn = null;  
           try {  
               LOG.info("Reading snapshot " + snap);  
               snapIS = new BufferedInputStream(new FileInputStream(snap));  
               crcIn = new CheckedInputStream(snapIS, new Adler32());  
               InputArchive ia = BinaryInputArchive.getArchive(crcIn);  
               deserialize(dt,sessions, ia);  
               long checkSum = crcIn.getChecksum().getValue();  
               long val = ia.readLong("val");  
               if (val != checkSum) {  
                   throw new IOException("CRC corruption in snapshot :  " + snap);  
               }  
               foundValid = true;  
               break;  
           } catch(IOException e) {  
               LOG.warn("problem reading snap file " + snap, e);  
           } finally {  
               if (snapIS != null)   
                   snapIS.close();  
               if (crcIn != null)   
                   crcIn.close();  
           }   
       }  
       if (!foundValid) {  
           throw new IOException("Not able to find valid snapshots in " + snapDir);  
       }  
//snapshot文件名就记录着最新的zxid  
       dt.lastProcessedZxid = Util.getZxidFromName(snap.getName(), "snapshot");  
       return dt.lastProcessedZxid;  
   }  

单个事务处理：

Java代码  
public void processTransaction(TxnHeader hdr,DataTree dt,  
         Map<Long, Integer> sessions, Record txn)  
     throws KeeperException.NoNodeException {  
     ProcessTxnResult rc;  
     switch (hdr.getType()) {  
     //创建session  
     case OpCode.createSession:  
         sessions.put(hdr.getClientId(),  
                 ((CreateSessionTxn) txn).getTimeOut());  
         ......  
         // give dataTree a chance to sync its lastProcessedZxid  
         rc = dt.processTxn(hdr, txn);  
         break;  
     case OpCode.closeSession:  
         sessions.remove(hdr.getClientId());  
         if (LOG.isTraceEnabled()) {  
             ZooTrace.logTraceMessage(LOG,ZooTrace.SESSION_TRACE_MASK,  
                     "playLog --- close session in log: 0x"  
                             + Long.toHexString(hdr.getClientId()));  
         }  
         rc = dt.processTxn(hdr, txn);  
         break;  
     default:  
         rc = dt.processTxn(hdr, txn);  
     }  
  
     ......  
 }  

DataTree处理单个事务

Java代码  
public ProcessTxnResult processTxn(TxnHeader header, Record txn)  
    {  
        ProcessTxnResult rc = new ProcessTxnResult();  
  
        try {  
            rc.clientId = header.getClientId();  
            rc.cxid = header.getCxid();  
            rc.zxid = header.getZxid();  
            rc.type = header.getType();  
            rc.err = 0;  
            rc.multiResult = null;  
            switch (header.getType()) {  
                case OpCode.create:  
                    CreateTxn createTxn = (CreateTxn) txn;  
                    rc.path = createTxn.getPath();  
                    createNode(  
                            createTxn.getPath(),  
                            createTxn.getData(),  
                            createTxn.getAcl(),  
                            createTxn.getEphemeral() ? header.getClientId() : 0,  
                            createTxn.getParentCVersion(),  
                            header.getZxid(), header.getTime());  
                    break;  
                case OpCode.delete:  
                    DeleteTxn deleteTxn = (DeleteTxn) txn;  
                    rc.path = deleteTxn.getPath();  
                    deleteNode(deleteTxn.getPath(), header.getZxid());  
                    break;  
                case OpCode.setData:  
                    SetDataTxn setDataTxn = (SetDataTxn) txn;  
                    rc.path = setDataTxn.getPath();  
                    rc.stat = setData(setDataTxn.getPath(), setDataTxn  
                            .getData(), setDataTxn.getVersion(), header  
                            .getZxid(), header.getTime());  
                    break;  
                ......  
        /* 
         * A snapshot might be in progress while we are modifying the data 
         * tree. If we set lastProcessedZxid prior to making corresponding 
         * change to the tree, then the zxid associated with the snapshot 
         * file will be ahead of its contents. Thus, while restoring from 
         * the snapshot, the restore method will not apply the transaction 
         * for zxid associated with the snapshot file, since the restore 
         * method assumes that transaction to be present in the snapshot. 
         * 
         * To avoid this, we first apply the transaction and then modify 
         * lastProcessedZxid.  During restore, we correctly handle the 
         * case where the snapshot contains data ahead of the zxid associated 
         * with the file. 
         */  
    //处理完事务后，再修改最新Zxid，如果是先修改Zxid再处理事务，修改完Zxid后，正好异步线程flush datatree  
    //此时由于事务并没有被处理，导致snapshot中的zxid比content新，而restore的时候是从最新zxid+1开始恢复的，从而  
    //导致丢数据  
        if (rc.zxid > lastProcessedZxid) {  
            lastProcessedZxid = rc.zxid;  
        }  
  
        ......  
        return rc;  
    }  

以上就完成了server的数据恢复过程，LSM的精华所在。

接下来server启动sessionTracker线程和请求处理链

Java代码  
protected void setupRequestProcessors() {  
       RequestProcessor finalProcessor = new FinalRequestProcessor(this);  
       RequestProcessor syncProcessor = new SyncRequestProcessor(this,  
               finalProcessor);  
       ((SyncRequestProcessor)syncProcessor).start();  
       firstProcessor = new PrepRequestProcessor(this, syncProcessor);  
       ((PrepRequestProcessor)firstProcessor).start();  
   }  

核心IO线程

Java代码  
/**
 * Main IO loop. Runs until the server socket is closed: waits on the
 * selector, accepts new connections (subject to the per-address connection
 * limit) and dispatches read/write readiness to the connection attached to
 * each key. Exceptions raised inside one iteration are logged and the loop
 * continues. When the loop ends, all connections are closed.
 */
public void run() {  
        while (!ss.socket().isClosed()) {  
            try {  
        // Wait for ready keys; the argument is the select timeout in milliseconds.
                selector.select(1000);  
                Set<SelectionKey> selected;  
                synchronized (this) {  
                    selected = selector.selectedKeys();  
                }  
                ArrayList<SelectionKey> selectedList = new ArrayList<SelectionKey>(  
                        selected);  
        // Process the ready keys in random order.
                Collections.shuffle(selectedList);  
                for (SelectionKey k : selectedList) {  
            // A new connection is pending: accept it.
                    if ((k.readyOps() & SelectionKey.OP_ACCEPT) != 0) {  
                        SocketChannel sc = ((ServerSocketChannel) k  
                                .channel()).accept();  
                        InetAddress ia = sc.socket().getInetAddress();  
                        int cnxncount = getClientCnxnCount(ia);  
            // Enforce the per-address connection limit (a limit <= 0 disables the check).
                        if (maxClientCnxns > 0 && cnxncount >= maxClientCnxns){  
                            LOG.warn("Too many connections from " + ia  
                                     + " - max is " + maxClientCnxns );  
                            sc.close();  
                        } else {  
                            LOG.info("Accepted socket connection from "  
                                     + sc.socket().getRemoteSocketAddress());  
                // Switch the channel to non-blocking mode.
                            sc.configureBlocking(false);  
                // Register interest in read events on the shared selector.
                            SelectionKey sk = sc.register(selector,  
                                    SelectionKey.OP_READ);  
                // Create the server-side connection object and attach it to the key.
                            NIOServerCnxn cnxn = createConnection(sc, sk);  
                            sk.attach(cnxn);  
                // Record the connection (presumably this feeds the per-client
                // count checked above - confirm against addCnxn).
                            addCnxn(cnxn);  
                        }  
                    }   
            // Read or write readiness: let the attached connection do the IO.
            else if ((k.readyOps() & (SelectionKey.OP_READ | SelectionKey.OP_WRITE)) != 0) {  
                        NIOServerCnxn c = (NIOServerCnxn) k.attachment();  
                        c.doIO(k);  
                    } else {  
                        if (LOG.isDebugEnabled()) {  
                            LOG.debug("Unexpected ops in select "  
                                      + k.readyOps());  
                        }  
                    }  
                }  
        // Clear the selected-key set so the next select() starts fresh.
                selected.clear();  
            } catch (RuntimeException e) {  
                LOG.warn("Ignoring unexpected runtime exception", e);  
            } catch (Exception e) {  
                LOG.warn("Ignoring exception", e);  
            }  
        }  
        closeAll();  
        LOG.info("NIOServerCnxn factory exited run method");  
    }  

具体io处理过程，将在后续结合实例来讲解。

至此server启动完成，就等待client去连接了。server启动核心功能就是从snapshot和log文件中恢复datatree，其核心就是zxid，典型的LSM应用。

http://iwinit.iteye.com/blog/1754123

分享到：

深入浅出Zookeeper之二Session建立 | ZooKeeper

2015-01-29 09:58
浏览 953
评论(0)
分类:编程语言
查看更多

发表评论

您还没有登录,请您登录后再发表评论

最近访客更多访客>>

博主相关

文章分类

社区版块

存档分类

最新评论