┌──(hadoop㉿hadoop-master)-[~]
└─$ # Stop YARN completely
stop-yarn.sh
start-yarn.sh
sleep 15
yarn node -list
Stopping nodemanagers
hadoop-worker2: WARNING: nodemanager did not stop gracefully after 5 seconds: Trying to kill with kill -9
hadoop-worker1: WARNING: nodemanager did not stop gracefully after 5 seconds: Trying to kill with kill -9
Stopping resourcemanager
Starting resourcemanager
Starting nodemanagers
2025-05-26 20:57:40,788 INFO client.DefaultNoHARMFailoverProxyProvider: Connecting to ResourceManager at hadoop-master/127.0.1.1:8032
Total Nodes:0
Node-Id Node-State Node-Http-Address Number-of-Running-Containers
┌──(hadoop㉿hadoop-master)-[~]
└─$ ┌──(hadoop㉿hadoop-master)-[~]
└─$ # Stop YARN completely
stop-yarn.sh
start-yarn.sh
sleep 15
yarn node -list
Stopping nodemanagers
hadoop-worker2: WARNING: nodemanager did not stop gracefully after 5 seconds: Trying to kill with kill -9
hadoop-worker1: WARNING: nodemanager did not stop gracefully after 5 seconds: Trying to kill with kill -9
Stopping resourcemanager
Starting resourcemanager
Starting nodemanagers
2025-05-26 20:57:40,788 INFO client.DefaultNoHARMFailoverProxyProvider: Connecting to ResourceManager at hadoop-master/127.0.1.1:8032
Total Nodes:0
Node-Id Node-State Node-Http-Address Number-of-Running-Containers
┌──(hadoop㉿hadoop-master)-[~]
└─$
-bash: syntax error near unexpected token `hadoop㉿hadoop-master'
└─$: command not found
Stopping nodemanagers
hadoop-worker2: WARNING: nodemanager did not stop gracefully after 5 seconds: Trying to kill with kill -9
hadoop-worker1: WARNING: nodemanager did not stop gracefully after 5 seconds: Trying to kill with kill -9
Stopping resourcemanager
Starting resourcemanager
Starting nodemanagers
2025-05-26 20:58:40,990 INFO client.DefaultNoHARMFailoverProxyProvider: Connecting to ResourceManager at hadoop-master/127.0.1.1:8032
Total Nodes:0
Node-Id Node-State Node-Http-Address Number-of-Running-Containers
Stopping: command not found
hadoop-worker2:: command not found
hadoop-worker1:: command not found
Stopping: command not found
Starting: command not found
Starting: command not found
2025-05-26: command not found
Total: command not found
Node-Id: command not found
-bash: syntax error near unexpected token `hadoop㉿hadoop-master'
└─$: command not found
┌──(hadoop㉿hadoop-master)-[~]
└─$ # Check NodeManager processes
echo "=== Checking NodeManager processes ==="
ssh hadoop-worker1 "jps | grep -i nodemanager"
ssh hadoop-worker2 "jps | grep -i nodemanager"
echo "=== Worker1 NodeManager startup log ==="
ssh hadoop-worker1 "tail -20 $HADOOP_HOME/logs/hadoop-hadoop-nodemanager-hadoop-worker1.out"
echo "=== Worker2 NodeManager startup log ==="
ssh hadoop-worker2 "tail -20 $HADOOP_HOME/logs/hadoop-hadoop-nodemanager-hadoop-worker2.out"
=== Checking NodeManager processes ===
4539 NodeManager
4561 NodeManager
=== Worker1 NodeManager startup log ===
real-time priority (-r) 0
stack size (kbytes, -s) 8192
cpu time (seconds, -t) unlimited
max user processes (-u) 7188
virtual memory (kbytes, -v) unlimited
file locks (-x) unlimited
May 26, 2025 8:58:20 PM com.sun.jersey.guice.spi.container.GuiceComponentProviderFactory register
INFO: Registering org.apache.hadoop.yarn.server.nodemanager.webapp.NMWebServices as a root resource class
May 26, 2025 8:58:20 PM com.sun.jersey.guice.spi.container.GuiceComponentProviderFactory register
INFO: Registering org.apache.hadoop.yarn.webapp.GenericExceptionHandler as a provider class
May 26, 2025 8:58:20 PM com.sun.jersey.guice.spi.container.GuiceComponentProviderFactory register
INFO: Registering org.apache.hadoop.yarn.server.nodemanager.webapp.JAXBContextResolver as a provider class
May 26, 2025 8:58:20 PM com.sun.jersey.server.impl.application.WebApplicationImpl _initiate
INFO: Initiating Jersey application, version 'Jersey: 1.19 02/11/2015 03:25 AM'
May 26, 2025 8:58:20 PM com.sun.jersey.guice.spi.container.GuiceComponentProviderFactory getComponentProvider
INFO: Binding org.apache.hadoop.yarn.server.nodemanager.webapp.JAXBContextResolver to GuiceManagedComponentProvider with the scope "Singleton"
May 26, 2025 8:58:21 PM com.sun.jersey.guice.spi.container.GuiceComponentProviderFactory getComponentProvider
INFO: Binding org.apache.hadoop.yarn.webapp.GenericExceptionHandler to GuiceManagedComponentProvider with the scope "Singleton"
May 26, 2025 8:58:22 PM com.sun.jersey.guice.spi.container.GuiceComponentProviderFactory getComponentProvider
INFO: Binding org.apache.hadoop.yarn.server.nodemanager.webapp.NMWebServices to GuiceManagedComponentProvider with the scope "Singleton"
=== Worker2 NodeManager startup log ===
real-time priority (-r) 0
stack size (kbytes, -s) 8192
cpu time (seconds, -t) unlimited
max user processes (-u) 7188
virtual memory (kbytes, -v) unlimited
file locks (-x) unlimited
May 26, 2025 8:58:25 PM com.sun.jersey.guice.spi.container.GuiceComponentProviderFactory register
INFO: Registering org.apache.hadoop.yarn.server.nodemanager.webapp.NMWebServices as a root resource class
May 26, 2025 8:58:25 PM com.sun.jersey.guice.spi.container.GuiceComponentProviderFactory register
INFO: Registering org.apache.hadoop.yarn.webapp.GenericExceptionHandler as a provider class
May 26, 2025 8:58:25 PM com.sun.jersey.guice.spi.container.GuiceComponentProviderFactory register
INFO: Registering org.apache.hadoop.yarn.server.nodemanager.webapp.JAXBContextResolver as a provider class
May 26, 2025 8:58:25 PM com.sun.jersey.server.impl.application.WebApplicationImpl _initiate
INFO: Initiating Jersey application, version 'Jersey: 1.19 02/11/2015 03:25 AM'
May 26, 2025 8:58:25 PM com.sun.jersey.guice.spi.container.GuiceComponentProviderFactory getComponentProvider
INFO: Binding org.apache.hadoop.yarn.server.nodemanager.webapp.JAXBContextResolver to GuiceManagedComponentProvider with the scope "Singleton"
May 26, 2025 8:58:26 PM com.sun.jersey.guice.spi.container.GuiceComponentProviderFactory getComponentProvider
INFO: Binding org.apache.hadoop.yarn.webapp.GenericExceptionHandler to GuiceManagedComponentProvider with the scope "Singleton"
May 26, 2025 8:58:26 PM com.sun.jersey.guice.spi.container.GuiceComponentProviderFactory getComponentProvider
INFO: Binding org.apache.hadoop.yarn.server.nodemanager.webapp.NMWebServices to GuiceManagedComponentProvider with the scope "Singleton"
┌──(hadoop㉿hadoop-master)-[~]
└─$ # Manually start NodeManagers
echo "=== Manually starting NodeManager on worker1 ==="
ssh hadoop-worker1 "$HADOOP_HOME/bin/yarn --daemon start nodemanager"
echo "=== Manually starting NodeManager on worker2 ==="
ssh hadoop-worker2 "$HADOOP_HOME/bin/yarn --daemon start nodemanager"
sleep 10
yarn node -list
=== Manually starting NodeManager on worker1 ===
nodemanager is running as process 4539. Stop it first and ensure /tmp/hadoop-hadoop-nodemanager.pid file is empty before retry.
=== Manually starting NodeManager on worker2 ===
nodemanager is running as process 4561. Stop it first and ensure /tmp/hadoop-hadoop-nodemanager.pid file is empty before retry.
2025-05-26 20:59:38,665 INFO client.DefaultNoHARMFailoverProxyProvider: Connecting to ResourceManager at hadoop-master/127.0.1.1:8032
Total Nodes:0
Node-Id Node-State Node-Http-Address Number-of-Running-Containers
┌──(hadoop㉿hadoop-master)-[~]
└─$
Great! The NodeManagers are actually running (processes 4539 and 4561), but they're not registering with the ResourceManager. Let's check the NodeManager logs to see why they can't connect:
# Check the actual NodeManager logs for connection issues
echo "=== Worker1 NodeManager main log ==="
ssh hadoop-worker1 "tail -50 $HADOOP_HOME/logs/hadoop-hadoop-nodemanager-hadoop-worker1.log | grep -E 'ERROR|WARN|Exception|Connection'"
echo "=== Worker2 NodeManager main log ==="
ssh hadoop-worker2 "tail -50 $HADOOP_HOME/logs/hadoop-hadoop-nodemanager-hadoop-worker2.log | grep -E 'ERROR|WARN|Exception|Connection'"

Let's also check if the ResourceManager is accessible from the workers:
# Test connectivity from workers to ResourceManager
echo "=== Testing ResourceManager connectivity from workers ==="
ssh hadoop-worker1 "telnet hadoop-master 8032 < /dev/null"
ssh hadoop-worker2 "telnet hadoop-master 8032 < /dev/null"
# Check if ResourceManager is listening on the right interface
echo "=== Checking ResourceManager binding ==="
netstat -tlnp | grep 8032

Let's also check the ResourceManager logs for any issues:
# Check ResourceManager logs for worker registration attempts
echo "=== ResourceManager logs ==="
tail -50 $HADOOP_HOME/logs/hadoop-hadoop-resourcemanager-hadoop-master.log | grep -E 'NodeManager|worker|ERROR|WARN'

If there are connectivity issues, we might need to update the ResourceManager binding:
# Check current core-site.xml
echo "=== Current core-site.xml ==="
cat $HADOOP_HOME/etc/hadoop/core-site.xml

Run these commands to diagnose why the NodeManagers aren't registering with the ResourceManager.
┌──(hadoop㉿hadoop-master)-[~]
└─$ # Check the actual NodeManager logs for connection issues
echo "=== Worker1 NodeManager main log ==="
ssh hadoop-worker1 "tail -50 $HADOOP_HOME/logs/hadoop-hadoop-nodemanager-hadoop-worker1.log | grep -E 'ERROR|WARN|Exception|Connection'"
echo "=== Worker2 NodeManager main log ==="
ssh hadoop-worker2 "tail -50 $HADOOP_HOME/logs/hadoop-hadoop-nodemanager-hadoop-worker2.log | grep -E 'ERROR|WARN|Exception|Connection'"
=== Worker1 NodeManager main log ===
2025-05-26 20:58:19,778 WARN org.apache.hadoop.security.authentication.server.AuthenticationFilter: Unable to initialize FileSignerSecretProvider, falling back to use random secrets. Reason: Could not read signature secret file: /home/hadoop/hadoop-http-auth-signature-secret
=== Worker2 NodeManager main log ===
2025-05-26 20:58:24,506 WARN org.apache.hadoop.security.authentication.server.AuthenticationFilter: Unable to initialize FileSignerSecretProvider, falling back to use random secrets. Reason: Could not read signature secret file: /home/hadoop/hadoop-http-auth-signature-secret
┌──(hadoop㉿hadoop-master)-[~]
└─$ # Test connectivity from workers to ResourceManager
echo "=== Testing ResourceManager connectivity from workers ==="
ssh hadoop-worker1 "telnet hadoop-master 8032 < /dev/null"
ssh hadoop-worker2 "telnet hadoop-master 8032 < /dev/null"
echo "=== Checking ResourceManager binding ==="
netstat -tlnp | grep 8032
=== Testing ResourceManager connectivity from workers ===
Trying 192.168.1.100...
telnet: Unable to connect to remote host: Connection refused
Trying 192.168.1.100...
telnet: Unable to connect to remote host: Connection refused
=== Checking ResourceManager binding ===
(Not all processes could be identified, non-owned process info
will not be shown, you would have to be root to see it all.)
tcp6 0 0 127.0.1.1:8032 :::* LISTEN 11328/java
┌──(hadoop㉿hadoop-master)-[~]
└─$ # Test connectivity from workers to ResourceManager
echo "=== Testing ResourceManager connectivity from workers ==="
ssh hadoop-worker1 "telnet hadoop-master 8032 < /dev/null"
ssh hadoop-worker2 "telnet hadoop-master 8032 < /dev/null"
echo "=== Checking ResourceManager binding ==="
netstat -tlnp | grep 8032
=== Testing ResourceManager connectivity from workers ===
Trying 192.168.1.100...
telnet: Unable to connect to remote host: Connection refused
Trying 192.168.1.100...
telnet: Unable to connect to remote host: Connection refused
=== Checking ResourceManager binding ===
(Not all processes could be identified, non-owned process info
will not be shown, you would have to be root to see it all.)
tcp6 0 0 127.0.1.1:8032 :::* LISTEN 11328/java
┌──(hadoop㉿hadoop-master)-[~]
└─$ # Check ResourceManager logs for worker registration attempts
echo "=== ResourceManager logs ==="
tail -50 $HADOOP_HOME/logs/hadoop-hadoop-resourcemanager-hadoop-master.log | grep -E 'NodeManager|worker|ERROR|WARN'
=== ResourceManager logs ===
┌──(hadoop㉿hadoop-master)-[~]
└─$ # Check current core-site.xml
echo "=== Current core-site.xml ==="
cat $HADOOP_HOME/etc/hadoop/core-site.xml
=== Current core-site.xml ===
┌──(hadoop㉿hadoop-master)-[~]
└─$
Found the issue! The ResourceManager is binding to 127.0.1.1:8032 (localhost) instead of the actual IP address 192.168.1.100, so the worker nodes can't connect to it.
Let's fix this by updating the yarn-site.xml to bind to the correct IP address:
# Fix: the ResourceManager was binding to 127.0.1.1:8032 (the /etc/hosts
# alias for hadoop-master), so worker connections were refused. Advertise
# the cluster-routable IP in the RM addresses and bind on all interfaces.
: "${HADOOP_HOME:?HADOOP_HOME must be set}"
RM_IP=192.168.1.100   # ResourceManager's address as seen from the workers

# Stop YARN before touching the configuration
stop-yarn.sh

# Rewrite yarn-site.xml. The heredoc delimiter is deliberately unquoted so
# ${RM_IP} expands; the XML contains no other shell-special characters.
cat > "$HADOOP_HOME/etc/hadoop/yarn-site.xml" <<EOF
<?xml version="1.0"?>
<configuration>
  <property>
    <name>yarn.nodemanager.aux-services</name>
    <value>mapreduce_shuffle</value>
  </property>
  <property>
    <name>yarn.nodemanager.aux-services.mapreduce_shuffle.class</name>
    <value>org.apache.hadoop.mapred.ShuffleHandler</value>
  </property>
  <!-- Advertised RM endpoints: must be reachable from the worker nodes -->
  <property>
    <name>yarn.resourcemanager.address</name>
    <value>${RM_IP}:8032</value>
  </property>
  <property>
    <name>yarn.resourcemanager.scheduler.address</name>
    <value>${RM_IP}:8030</value>
  </property>
  <property>
    <name>yarn.resourcemanager.resource-tracker.address</name>
    <value>${RM_IP}:8031</value>
  </property>
  <property>
    <name>yarn.resourcemanager.admin.address</name>
    <value>${RM_IP}:8033</value>
  </property>
  <property>
    <name>yarn.resourcemanager.webapp.address</name>
    <value>${RM_IP}:8088</value>
  </property>
  <!-- Listen on all interfaces instead of whatever the hostname resolves to -->
  <property>
    <name>yarn.resourcemanager.bind-host</name>
    <value>0.0.0.0</value>
  </property>
  <property>
    <name>yarn.nodemanager.resource.memory-mb</name>
    <value>2048</value>
  </property>
  <property>
    <name>yarn.scheduler.maximum-allocation-mb</name>
    <value>2048</value>
  </property>
  <property>
    <name>yarn.scheduler.minimum-allocation-mb</name>
    <value>128</value>
  </property>
  <property>
    <name>yarn.nodemanager.vmem-check-enabled</name>
    <value>false</value>
  </property>
  <!-- NOTE(review): yarn.app.mapreduce.am.resource.mb is a MapReduce
       property normally set in mapred-site.xml; harmless here but consider
       moving it there. -->
  <property>
    <name>yarn.app.mapreduce.am.resource.mb</name>
    <value>512</value>
  </property>
</configuration>
EOF

# Ship the updated configuration to both workers so all daemons agree
scp "$HADOOP_HOME/etc/hadoop/yarn-site.xml" hadoop-worker1:"$HADOOP_HOME/etc/hadoop/"
scp "$HADOOP_HOME/etc/hadoop/yarn-site.xml" hadoop-worker2:"$HADOOP_HOME/etc/hadoop/"

# Restart YARN and give the daemons time to come up
start-yarn.sh
sleep 15

# Verify the RM now listens on a routable address, not 127.0.1.1
echo "=== Checking ResourceManager binding ==="
netstat -tlnp | grep 8032

# Confirm a worker can actually reach port 8032 now
echo "=== Testing connectivity ==="
ssh hadoop-worker1 "telnet $RM_IP 8032 < /dev/null"

# List the nodes registered with the ResourceManager
yarn node -list

The key changes:
- yarn.resourcemanager.bind-host=0.0.0.0 makes the ResourceManager listen on all interfaces.
- The yarn.resourcemanager.*.address properties now advertise 192.168.1.100 instead of a hostname that resolves to 127.0.1.1.

This should allow the NodeManagers to connect to the ResourceManager properly.