diff --git a/pkg/controller/cluster/client.go b/pkg/controller/cluster/client.go
index 785ad285..f924c9b1 100644
--- a/pkg/controller/cluster/client.go
+++ b/pkg/controller/cluster/client.go
@@ -23,7 +23,7 @@ func newVirtualClient(ctx context.Context, hostClient ctrlruntimeclient.Client,
 	}
 
 	if err := hostClient.Get(ctx, kubeconfigSecretName, &clusterKubeConfig); err != nil {
-		return nil, fmt.Errorf("failed to get kubeconfig secret: %w", err)
+		return nil, err
 	}
 
 	restConfig, err := clientcmd.RESTConfigFromKubeConfig(clusterKubeConfig.Data["kubeconfig.yaml"])
diff --git a/pkg/controller/cluster/server/server.go b/pkg/controller/cluster/server/server.go
index 664a082f..16f491ed 100644
--- a/pkg/controller/cluster/server/server.go
+++ b/pkg/controller/cluster/server/server.go
@@ -416,9 +416,11 @@ func (s *Server) setupDynamicPersistence() v1.PersistentVolumeClaim {
 func (s *Server) setupStartCommand() (string, error) {
 	var output bytes.Buffer
 
-	tmpl := singleServerTemplate
+	tmpl := StartupCommand
+
+	mode := "single"
 	if *s.cluster.Spec.Servers > 1 {
-		tmpl = HAServerTemplate
+		mode = "ha"
 	}
 
 	tmplCmd, err := template.New("").Parse(tmpl)
@@ -430,6 +432,8 @@ func (s *Server) setupStartCommand() (string, error) {
 		"ETCD_DIR":      "/var/lib/rancher/k3s/server/db/etcd",
 		"INIT_CONFIG":   "/opt/rancher/k3s/init/config.yaml",
 		"SERVER_CONFIG": "/opt/rancher/k3s/server/config.yaml",
+		"CLUSTER_MODE":  mode,
+		"K3K_MODE":      string(s.cluster.Spec.Mode),
 		"EXTRA_ARGS":    strings.Join(s.cluster.Spec.ServerArgs, " "),
 	}); err != nil {
 		return "", err
diff --git a/pkg/controller/cluster/server/template.go b/pkg/controller/cluster/server/template.go
index b472f68d..101de6cb 100644
--- a/pkg/controller/cluster/server/template.go
+++ b/pkg/controller/cluster/server/template.go
@@ -1,16 +1,102 @@
 package server
 
-var singleServerTemplate string = `
-if [ -d "{{.ETCD_DIR}}" ]; then
-	# if directory exists then it means its not an initial run
-	/bin/k3s server --cluster-reset --config {{.INIT_CONFIG}} {{.EXTRA_ARGS}} 2>&1 | tee /var/log/k3s.log
-fi
-rm -f /var/lib/rancher/k3s/server/db/reset-flag
-/bin/k3s server --config {{.INIT_CONFIG}} {{.EXTRA_ARGS}} 2>&1 | tee /var/log/k3s.log`
-
-var HAServerTemplate string = `
-if [ ${POD_NAME: -1} == 0 ] && [ ! -d "{{.ETCD_DIR}}" ]; then
+var StartupCommand string = `
+info()
+{
+	echo "[INFO] [$(date +"%c")]" "$@"
+}
+
+fatal()
+{
+	echo "[FATAL] [$(date +"%c")] " "$@" >&2
+	exit 1
+}
+
+# safe mode function to reset the node IP after pod restarts
+safe_mode() {
+	CURRENT_IP=""
+	if [ -f /var/lib/rancher/k3s/k3k-node-ip ]; then
+		CURRENT_IP=$(cat /var/lib/rancher/k3s/k3k-node-ip)
+	fi
+
+	if [ -z "$CURRENT_IP" ] || [ "$CURRENT_IP" = "$POD_IP" ] || [ "{{.K3K_MODE}}" != "virtual" ]; then
+		return
+	fi
+
+	# skip safe mode if the node is starting for the first time
+	if [ -d "{{.ETCD_DIR}}" ]; then
+
+		info "Starting K3s in Safe Mode (Network Policy Disabled) to patch Node IP from ${CURRENT_IP} to ${POD_IP}"
+		/bin/k3s server --disable-network-policy --config $1 {{.EXTRA_ARGS}} > /dev/null 2>&1 &
+		PID=$!
+
+		# start the loop to wait for the node IP to change
+		info "Waiting for Node IP to update to ${POD_IP}."
+		count=0
+		until kubectl get nodes -o wide 2>/dev/null | grep -q "${POD_IP}"; do
+			if ! kill -0 $PID 2>/dev/null; then
+				fatal "Safe Mode K3s process died unexpectedly!"
+			fi
+			sleep 2
+			count=$((count+1))
+
+			if [ $count -gt 60 ]; then
+				fatal "timed out waiting for node to change IP from $CURRENT_IP to $POD_IP"
+			fi
+		done
+
+		info "Node IP is set to ${POD_IP} successfully. Stopping Safe Mode process..."
+		kill $PID
+		wait $PID 2>/dev/null || true
+	fi
+}
+
+start_single_node() {
+	info "Starting single node setup..."
+
+	# if existing data is found in a single-server setup, we must perform a cluster reset
+	if [ -d "{{.ETCD_DIR}}" ]; then
+		info "Existing data found in single node setup. Performing cluster-reset to ensure quorum..."
+
+		if ! /bin/k3s server --cluster-reset --config {{.INIT_CONFIG}} {{.EXTRA_ARGS}} > /dev/null 2>&1; then
+			fatal "cluster reset failed!"
+		fi
+		info "Cluster reset complete. Removing reset flag file."
+		rm -f /var/lib/rancher/k3s/server/db/reset-flag
+	fi
+
+	# entering safe mode to ensure the correct node IP
+	safe_mode {{.INIT_CONFIG}}
+
+	info "Adding pod IP file."
+	echo $POD_IP > /var/lib/rancher/k3s/k3k-node-ip
+
 	/bin/k3s server --config {{.INIT_CONFIG}} {{.EXTRA_ARGS}} 2>&1 | tee /var/log/k3s.log
-else
-	/bin/k3s server --config {{.SERVER_CONFIG}} {{.EXTRA_ARGS}} 2>&1 | tee /var/log/k3s.log
-fi`
+}
+
+start_ha_node() {
+	info "Starting pod $POD_NAME in HA node setup"
+
+	if [ ${POD_NAME: -1} == 0 ] && [ ! -d "{{.ETCD_DIR}}" ]; then
+		info "Adding pod IP file."
+		echo $POD_IP > /var/lib/rancher/k3s/k3k-node-ip
+
+		/bin/k3s server --config {{.INIT_CONFIG}} {{.EXTRA_ARGS}} 2>&1 | tee /var/log/k3s.log
+	else
+		safe_mode {{.SERVER_CONFIG}}
+
+		info "Adding pod IP file."
+		echo $POD_IP > /var/lib/rancher/k3s/k3k-node-ip
+
+		/bin/k3s server --config {{.SERVER_CONFIG}} {{.EXTRA_ARGS}} 2>&1 | tee /var/log/k3s.log
+	fi
+}
+
+case "{{.CLUSTER_MODE}}" in
+	"ha")
+		start_ha_node
+		;;
+	"single"|*)
+		start_single_node
+		;;
+esac`
diff --git a/pkg/controller/cluster/statefulset.go b/pkg/controller/cluster/statefulset.go
index f41c1d7d..3978bd8b 100644
--- a/pkg/controller/cluster/statefulset.go
+++ b/pkg/controller/cluster/statefulset.go
@@ -120,7 +120,7 @@ func (p *StatefulSetReconciler) handleServerPod(ctx context.Context, cluster v1b
 
 	if pod.DeletionTimestamp.IsZero() {
 		if controllerutil.AddFinalizer(pod, etcdPodFinalizerName) {
-			log.V(1).Info("Server Pod is being deleted. Removing finalizer", "pod", pod.Name, "namespace", pod.Namespace)
+			log.V(1).Info("Server Pod is being created. Adding finalizer", "pod", pod.Name, "namespace", pod.Namespace)
 
 			return p.Client.Update(ctx, pod)
 		}
diff --git a/tests/cluster_update_test.go b/tests/cluster_update_test.go
index 94856dc8..ff74a856 100644
--- a/tests/cluster_update_test.go
+++ b/tests/cluster_update_test.go
@@ -127,6 +127,11 @@ var _ = When("a shared mode cluster update its envs", Label(e2eTestLabel), Label
 			serverPods := listServerPods(ctx, virtualCluster)
 			g.Expect(len(serverPods)).To(Equal(1))
 
+			serverPod := serverPods[0]
+			_, cond := pod.GetPodCondition(&serverPod.Status, v1.PodReady)
+			g.Expect(cond).NotTo(BeNil())
+			g.Expect(cond.Status).To(BeEquivalentTo(metav1.ConditionTrue))
+
 			serverEnv1, ok := getEnv(&serverPods[0], "TEST_SERVER_ENV_1")
 			g.Expect(ok).To(BeTrue())
 			g.Expect(serverEnv1).To(Equal("upgraded"))
@@ -145,6 +150,11 @@ var _ = When("a shared mode cluster update its envs", Label(e2eTestLabel), Label
 			aPods := listAgentPods(ctx, virtualCluster)
 			g.Expect(aPods).To(HaveLen(len(nodes.Items)))
 
+			agentPod := aPods[0]
+			_, cond = pod.GetPodCondition(&agentPod.Status, v1.PodReady)
+			g.Expect(cond).NotTo(BeNil())
+			g.Expect(cond.Status).To(BeEquivalentTo(metav1.ConditionTrue))
+
 			agentEnv1, ok := getEnv(&aPods[0], "TEST_AGENT_ENV_1")
 			g.Expect(ok).To(BeTrue())
 			g.Expect(agentEnv1).To(Equal("upgraded"))
@@ -213,6 +223,11 @@ var _ = When("a shared mode cluster update its server args", Label(e2eTestLabel)
 			sPods := listServerPods(ctx, virtualCluster)
 			g.Expect(len(sPods)).To(Equal(1))
 
+			serverPod := sPods[0]
+			_, cond := pod.GetPodCondition(&serverPod.Status, v1.PodReady)
+			g.Expect(cond).NotTo(BeNil())
+			g.Expect(cond.Status).To(BeEquivalentTo(metav1.ConditionTrue))
+
 			g.Expect(isArgFound(&sPods[0], "--node-label=test_server=upgraded")).To(BeTrue())
 		}).
 			WithPolling(time.Second * 2).
@@ -330,6 +345,11 @@ var _ = When("a virtual mode cluster update its envs", Label(e2eTestLabel), Labe
 			serverPods := listServerPods(ctx, virtualCluster)
 			g.Expect(len(serverPods)).To(Equal(1))
 
+			serverPod := serverPods[0]
+			_, cond := pod.GetPodCondition(&serverPod.Status, v1.PodReady)
+			g.Expect(cond).NotTo(BeNil())
+			g.Expect(cond.Status).To(BeEquivalentTo(metav1.ConditionTrue))
+
 			serverEnv1, ok := getEnv(&serverPods[0], "TEST_SERVER_ENV_1")
 			g.Expect(ok).To(BeTrue())
 			g.Expect(serverEnv1).To(Equal("upgraded"))
@@ -345,6 +365,11 @@ var _ = When("a virtual mode cluster update its envs", Label(e2eTestLabel), Labe
 			aPods := listAgentPods(ctx, virtualCluster)
 			g.Expect(len(aPods)).To(Equal(1))
 
+			agentPod := aPods[0]
+			_, cond = pod.GetPodCondition(&agentPod.Status, v1.PodReady)
+			g.Expect(cond).NotTo(BeNil())
+			g.Expect(cond.Status).To(BeEquivalentTo(metav1.ConditionTrue))
+
 			agentEnv1, ok := getEnv(&aPods[0], "TEST_AGENT_ENV_1")
 			g.Expect(ok).To(BeTrue())
 			g.Expect(agentEnv1).To(Equal("upgraded"))
@@ -416,6 +441,11 @@ var _ = When("a virtual mode cluster update its server args", Label(e2eTestLabel
 			sPods := listServerPods(ctx, virtualCluster)
 			g.Expect(len(sPods)).To(Equal(1))
 
+			serverPod := sPods[0]
+			_, cond := pod.GetPodCondition(&serverPod.Status, v1.PodReady)
+			g.Expect(cond).NotTo(BeNil())
+			g.Expect(cond.Status).To(BeEquivalentTo(metav1.ConditionTrue))
+
 			g.Expect(isArgFound(&sPods[0], "--node-label=test_server=upgraded")).To(BeTrue())
 		}).
 			WithPolling(time.Second * 2).
diff --git a/tests/common_test.go b/tests/common_test.go
index 131d83cc..8651874f 100644
--- a/tests/common_test.go
+++ b/tests/common_test.go
@@ -140,9 +140,6 @@ func NewCluster(namespace string) *v1beta1.Cluster {
 			Persistence: v1beta1.PersistenceConfig{
 				Type: v1beta1.EphemeralPersistenceMode,
 			},
-			ServerArgs: []string{
-				"--disable-network-policy",
-			},
 		},
 	}
 }