StarCluster - Mailing List Archive

Re: [Starcluster] cannot start cluster with m2.4xlarge

From: Damian Eads <no email>
Date: Sun, 25 Apr 2010 17:10:54 -0700

Excellent. It works now. My experiment has been running on 24 cores (3
x m2.4xlarge) with about 70% memory use for 5 hours without crashing.

Thanks,

Damian

On Sat, Apr 24, 2010 at 6:59 PM, Justin Riley <jtriley_at_mit.edu> wrote:
> Hi Damian,
>
> My bad, typo in the last commit. Should be fixed in github now.
>
> ~Justin
>
> Quoting Damian Eads <eads_at_soe.ucsc.edu>:
>
>> Hi,
>>
>> I've tried starting a three node cluster with the biggest instance
>> type (m2.4xlarge) and it crashes here. This is with Justin's latest
>> git. Ideas?
>>
>> Thanks,
>>
>> Damian
>>
>>
>> eads_at_argentina:~/work/repo/StarCluster$ starcluster start -x mycluster dtest
>> StarCluster - (http://web.mit.edu/starcluster)
>> Software Tools for Academics and Researchers (STAR)
>> Please submit bug reports to starcluster_at_mit.edu
>>
>>>>> Validating cluster settings...
>>>>> Cluster settings are valid
>>>>> Starting cluster...
>>>>> Waiting for cluster to start...
>>>>> The master node is ec2-174-129-138-92.compute-1.amazonaws.com
>>>>> Attaching volume vol-1dbc0c74 to master node...
>>>>> Setting up the cluster...
>>>>> Mounting EBS volume vol-1dbc0c74 on /data...
>> ssh.py:66 - WARNING - specified key does not end in either rsa or dsa, trying both
>>>>> Using private key /home/eads/deadskey.pem (rsa)
>>>>> Creating cluster user: sgeadmin
>> ssh.py:66 - WARNING - specified key does not end in either rsa or dsa, trying both
>>>>> Using private key /home/eads/deadskey.pem (rsa)
>> ssh.py:66 - WARNING - specified key does not end in either rsa or dsa, trying both
>>>>> Using private key /home/eads/deadskey.pem (rsa)
>>>>> Configuring scratch space for user: sgeadmin
>>>>> Configuring /etc/hosts on each node
>>>>> Configuring NFS...
>> ERROR: An unexpected error occurred while tokenizing input
>> The following traceback may be corrupted or invalid
>> The error message is: ('EOF in multi-line statement', (405, 0))
>>
>> ---------------------------------------------------------------------------
>> NameError                                 Traceback (most recent call last)
>>
>> /tmp/qqq/lib/python2.6/site-packages/StarCluster-0.9999-py2.6.egg/EGG-INFO/scripts/starcluster
>> in <module>()
>>      3 __requires__ = 'StarCluster==0.9999'
>>      4 import pkg_resources
>> ----> 5 pkg_resources.run_script('StarCluster==0.9999', 'starcluster')
>>      6
>>      7
>>
>> /usr/lib/python2.6/dist-packages/pkg_resources.pyc in run_script(self,
>> requires, script_name)
>>    446         ns.clear()
>>    447         ns['__name__'] = name
>> --> 448         self.require(requires)[0].run_script(script_name, ns)
>>    449
>>    450
>>
>> /usr/lib/python2.6/dist-packages/pkg_resources.pyc in run_script(self,
>> script_name, namespace)
>>   1171             )
>>   1172             script_code = compile(script_text,script_filename,'exec')
>> -> 1173             exec script_code in namespace, namespace
>>   1174
>>   1175     def _has(self, path):
>>
>> /tmp/qqq/lib/python2.6/site-packages/StarCluster-0.9999-py2.6.egg/EGG-INFO/scripts/starcluster
>> in <module>()
>>      4
>>      5
>> ----> 6
>>      7
>>      8
>>
>> /tmp/qqq/lib/python2.6/site-packages/StarCluster-0.9999-py2.6.egg/starcluster/cli.pyc
>> in main()
>>    918         sys.exit(0)
>>    919     try:
>> --> 920         sc.execute(args)
>>    921     except exception.BaseException,e:
>>    922         log.error(e.msg)
>>
>> /tmp/qqq/lib/python2.6/site-packages/StarCluster-0.9999-py2.6.egg/starcluster/cli.pyc
>> in execute(self, args)
>>    179             log.info('Cluster settings are valid')
>>    180             if not self.opts.validate_only:
>> --> 181                 scluster.start(create=not self.opts.no_create)
>>    182                 if self.opts.login_master:
>>    183                     cluster.ssh_to_master(tag, self.cfg)
>>
>> /tmp/qqq/lib/python2.6/site-packages/StarCluster-0.9999-py2.6.egg/starcluster/utils.pyc
>> in wrapper(*arg, **kargs)
>>     24         """Raw timing function """
>>     25         time1 = time.time()
>> ---> 26         res = func(*arg, **kargs)
>>     27         time2 = time.time()
>>     28         log.info('%s took %0.3f mins' % (func.func_name, (time2-time1)/60.0))
>>
>> /tmp/qqq/lib/python2.6/site-packages/StarCluster-0.9999-py2.6.egg/starcluster/cluster.pyc
>> in start(self, create)
>>    512             self.nodes, self.master_node,
>>    513             self.cluster_user, self.cluster_shell,
>> --> 514             self.volumes
>>    515         )
>>    516         self.create_receipt()
>>
>> /tmp/qqq/lib/python2.6/site-packages/StarCluster-0.9999-py2.6.egg/starcluster/clustersetup.pyc
>> in run(self, nodes, master, user, user_shell, volumes)
>>    334         self._setup_scratch()
>>    335         self._setup_etc_hosts()
>> --> 336         self._setup_nfs()
>>    337         self._setup_passwordless_ssh()
>>    338         self._setup_sge()
>>
>> /tmp/qqq/lib/python2.6/site-packages/StarCluster-0.9999-py2.6.egg/starcluster/clustersetup.pyc
>> in _setup_nfs(self)
>>    245         mconn.execute('/etc/init.d/nfs start')
>>    246         mconn.execute('/usr/sbin/exportfs -r')
>> --> 247         mconn.execute('mount -t devpts none /dev/pts') # fix for xterm/mpi printing to stdout
>>    248
>>    249         # setup /etc/fstab and mount /home and /opt/sge6 on each node
>>
>>
>> /tmp/qqq/lib/python2.6/site-packages/StarCluster-0.9999-py2.6.egg/starcluster/ssh.pyc
>> in execute(self, command, silent, only_printable, ignore_exit_status)
>>    221         exit_status = channel.recv_exit_status()
>>    222         if exit_status != 0:
>> --> 223             if not ignore_exist_status:
>>    224                 log.error("command %s failed with status %d" % (command,
>>    225                                                                 exit_status))
>>
>>
>> --
>> -----------------------------------------------------
>> Damian Eads                           Ph.D. Candidate
>> University of California             Computer Science
>> 1156 High Street         Machine Learning Lab, E2-489
>> Santa Cruz, CA 95064    http://www.soe.ucsc.edu/~eads
>> _______________________________________________
>> Starcluster mailing list
>> Starcluster_at_mit.edu
>> http://mailman.mit.edu/mailman/listinfo/starcluster
>>
>
>
> _______________________________________________
> Starcluster mailing list
> Starcluster_at_mit.edu
> http://mailman.mit.edu/mailman/listinfo/starcluster
>



-- 
-----------------------------------------------------
Damian Eads                           Ph.D. Candidate
University of California             Computer Science
1156 High Street         Machine Learning Lab, E2-489
Santa Cruz, CA 95064    http://www.soe.ucsc.edu/~eads
Received on Sun Apr 25 2010 - 20:10:56 EDT
This archive was generated by hypermail 2.3.0.

Search:

Sort all by:

Date

Month

Thread

Author

Subject