Hi,
I've tried starting a three-node cluster with the biggest instance
type (m2.4xlarge), and it crashes with the traceback shown below. This
is with Justin's latest git checkout. Any ideas?
Thanks,
Damian
eads_at_argentina:~/work/repo/StarCluster$ starcluster start -x mycluster dtest
StarCluster - (http://web.mit.edu/starcluster)
Software Tools for Academics and Researchers (STAR)
Please submit bug reports to starcluster_at_mit.edu
>>> Validating cluster settings...
>>> Cluster settings are valid
>>> Starting cluster...
>>> Waiting for cluster to start...
>>> The master node is ec2-174-129-138-92.compute-1.amazonaws.com
>>> Attaching volume vol-1dbc0c74 to master node...
>>> Setting up the cluster...
>>> Mounting EBS volume vol-1dbc0c74 on /data...
ssh.py:66 - WARNING - specified key does not end in either rsa or dsa,
trying both
>>> Using private key /home/eads/deadskey.pem (rsa)
>>> Creating cluster user: sgeadmin
ssh.py:66 - WARNING - specified key does not end in either rsa or dsa,
trying both
>>> Using private key /home/eads/deadskey.pem (rsa)
ssh.py:66 - WARNING - specified key does not end in either rsa or dsa,
trying both
>>> Using private key /home/eads/deadskey.pem (rsa)
>>> Configuring scratch space for user: sgeadmin
>>> Configuring /etc/hosts on each node
>>> Configuring NFS...
ERROR: An unexpected error occurred while tokenizing input
The following traceback may be corrupted or invalid
The error message is: ('EOF in multi-line statement', (405, 0))
---------------------------------------------------------------------------
NameError Traceback (most recent call last)
/tmp/qqq/lib/python2.6/site-packages/StarCluster-0.9999-py2.6.egg/EGG-INFO/scripts/starcluster
in <module>()
3 __requires__ = 'StarCluster==0.9999'
4 import pkg_resources
----> 5 pkg_resources.run_script('StarCluster==0.9999', 'starcluster')
6
7
/usr/lib/python2.6/dist-packages/pkg_resources.pyc in run_script(self,
requires, script_name)
446 ns.clear()
447 ns['__name__'] = name
--> 448 self.require(requires)[0].run_script(script_name, ns)
449
450
/usr/lib/python2.6/dist-packages/pkg_resources.pyc in run_script(self,
script_name, namespace)
1171 )
1172 script_code = compile(script_text,script_filename,'exec')
-> 1173 exec script_code in namespace, namespace
1174
1175 def _has(self, path):
/tmp/qqq/lib/python2.6/site-packages/StarCluster-0.9999-py2.6.egg/EGG-INFO/scripts/starcluster
in <module>()
4
5
----> 6
7
8
/tmp/qqq/lib/python2.6/site-packages/StarCluster-0.9999-py2.6.egg/starcluster/cli.pyc
in main()
918 sys.exit(0)
919 try:
--> 920 sc.execute(args)
921 except exception.BaseException,e:
922 log.error(e.msg)
/tmp/qqq/lib/python2.6/site-packages/StarCluster-0.9999-py2.6.egg/starcluster/cli.pyc
in execute(self, args)
179 log.info('Cluster settings are valid')
180 if not self.opts.validate_only:
--> 181 scluster.start(create=not self.opts.no_create)
182 if self.opts.login_master:
183 cluster.ssh_to_master(tag, self.cfg)
/tmp/qqq/lib/python2.6/site-packages/StarCluster-0.9999-py2.6.egg/starcluster/utils.pyc
in wrapper(*arg, **kargs)
24 """Raw timing function """
25 time1 = time.time()
---> 26 res = func(*arg, **kargs)
27 time2 = time.time()
28 log.info('%s took %0.3f mins' % (func.func_name,
(time2-time1)/60.0))
/tmp/qqq/lib/python2.6/site-packages/StarCluster-0.9999-py2.6.egg/starcluster/cluster.pyc
in start(self, create)
512 self.nodes, self.master_node,
513 self.cluster_user, self.cluster_shell,
--> 514 self.volumes
515 )
516 self.create_receipt()
/tmp/qqq/lib/python2.6/site-packages/StarCluster-0.9999-py2.6.egg/starcluster/clustersetup.pyc
in run(self, nodes, master, user, user_shell, volumes)
334 self._setup_scratch()
335 self._setup_etc_hosts()
--> 336 self._setup_nfs()
337 self._setup_passwordless_ssh()
338 self._setup_sge()
/tmp/qqq/lib/python2.6/site-packages/StarCluster-0.9999-py2.6.egg/starcluster/clustersetup.pyc
in _setup_nfs(self)
245 mconn.execute('/etc/init.d/nfs start')
246 mconn.execute('/usr/sbin/exportfs -r')
--> 247 mconn.execute('mount -t devpts none /dev/pts') # fix for xterm/mpi printing to stdout
248
249 # setup /etc/fstab and mount /home and /opt/sge6 on each node
/tmp/qqq/lib/python2.6/site-packages/StarCluster-0.9999-py2.6.egg/starcluster/ssh.pyc
in execute(self, command, silent, only_printable, ignore_exit_status)
221 exit_status = channel.recv_exit_status()
222 if exit_status != 0:
--> 223 if not ignore_exist_status:
224 log.error("command %s failed with status %d" % (command,
225
exit_status))
--
-----------------------------------------------------
Damian Eads Ph.D. Candidate
University of California Computer Science
1156 High Street Machine Learning Lab, E2-489
Santa Cruz, CA 95064 http://www.soe.ucsc.edu/~eads
Received on Sat Apr 24 2010 - 06:36:40 EDT