/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.ignite.internal.worker;

import static org.apache.ignite.internal.failure.FailureType.CRITICAL_ERROR;
import static org.apache.ignite.internal.failure.FailureType.SYSTEM_WORKER_BLOCKED;
import static org.apache.ignite.internal.util.CompletableFutures.nullCompletedFuture;

import it.unimi.dsi.fastutil.longs.Long2LongMap;
import it.unimi.dsi.fastutil.longs.Long2LongOpenHashMap;
import java.lang.management.LockInfo;
import java.lang.management.ManagementFactory;
import java.lang.management.MonitorInfo;
import java.lang.management.ThreadInfo;
import java.lang.management.ThreadMXBean;
import java.util.Set;
import java.util.concurrent.CompletableFuture;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.ScheduledExecutorService;
import java.util.concurrent.ScheduledFuture;
import java.util.concurrent.TimeUnit;
import org.apache.ignite.internal.failure.FailureContext;
import org.apache.ignite.internal.failure.FailureManager;
import org.apache.ignite.internal.logger.IgniteLogger;
import org.apache.ignite.internal.logger.Loggers;
import org.apache.ignite.internal.manager.ComponentContext;
import org.apache.ignite.internal.manager.IgniteComponent;
import org.apache.ignite.internal.worker.configuration.CriticalWorkersConfiguration;
import org.jetbrains.annotations.Nullable;

/**
 * A watchdog that monitors liveness of the registered workers and, if a worker is suspected to be blocked, reports the corresponding
 * information (including the full stack trace of the worker's thread) to the {@link FailureManager} as a
 * {@code SYSTEM_WORKER_BLOCKED} failure.
 *
 * <p>Each worker is expected to maintain its {@link CriticalWorker#heartbeatNanos()} growing while the worker executes some computations.
 * If the worker does not do any computations (it is blocked on an I/O operation, waits for a lock, or has no work to do),
 * it must set its {@link CriticalWorker#heartbeatNanos()} to {@link CriticalWorker#NOT_MONITORED}.
 *
 * <p>The watchdog periodically performs a check; if it finds a worker that lags more than allowed and it is not in the
 * NOT_MONITORED state, then a failure describing the blocked thread is passed to the failure manager.
 */
public class CriticalWorkerWatchdog implements CriticalWorkerRegistry, IgniteComponent {
    private static final IgniteLogger LOG = Loggers.forClass(CriticalWorkerWatchdog.class);

    /** Source of the check interval and of the maximum allowed heartbeat lag. */
    private final CriticalWorkersConfiguration configuration;

    /** Executor on which the periodic liveness probe is scheduled. */
    private final ScheduledExecutorService scheduler;

    /** Workers currently monitored; a concurrent set because registration may race with the periodic probe. */
    private final Set<CriticalWorker> registeredWorkers = ConcurrentHashMap.newKeySet();

    /** Handle of the scheduled probe task; {@code null} until {@link #startAsync(ComponentContext)} is called. */
    @Nullable
    private volatile ScheduledFuture<?> livenessProbeTaskFuture;

    /** Used to obtain stack traces and lock information of the lagging threads. */
    private final ThreadMXBean threadMxBean = ManagementFactory.getThreadMXBean();

    /** Receives a failure for each blocked worker thread, and for unexpected errors of the probe itself. */
    private final FailureManager failureManager;

    /**
     * Creates a new instance of the watchdog.
     *
     * @param configuration Configuration.
     * @param scheduler Scheduler.
     * @param failureManager Failure processor.
     */
    public CriticalWorkerWatchdog(
            CriticalWorkersConfiguration configuration,
            ScheduledExecutorService scheduler,
            FailureManager failureManager
    ) {
        this.configuration = configuration;
        this.scheduler = scheduler;
        this.failureManager = failureManager;
    }

    @Override
    public void register(CriticalWorker worker) {
        registeredWorkers.add(worker);
    }

    @Override
    public void unregister(CriticalWorker worker) {
        registeredWorkers.remove(worker);
    }

    /**
     * Schedules the periodic liveness probe. The check interval is read from the configuration once, at start time, and is used both as
     * the initial delay and as the period.
     *
     * @param componentContext Component context (not used by this component).
     * @return An already completed future.
     */
    @Override
    public CompletableFuture<Void> startAsync(ComponentContext componentContext) {
        long livenessCheckIntervalMs = configuration.livenessCheckIntervalMillis().value();

        livenessProbeTaskFuture = scheduler.scheduleAtFixedRate(
                this::probeLiveness,
                livenessCheckIntervalMs,
                livenessCheckIntervalMs,
                TimeUnit.MILLISECONDS
        );

        return nullCompletedFuture();
    }

    /**
     * Entry point of the scheduled task: runs one probe and makes sure nothing thrown by it escapes to the scheduler.
     */
    private void probeLiveness() {
        try {
            doProbeLiveness();
        } catch (Exception | AssertionError e) {
            // A task scheduled at fixed rate is not executed anymore once one of its runs throws, so the throwable is
            // only logged here to keep the subsequent probes running.
            LOG.debug("Error while probing liveness", e);
        } catch (Error e) {
            // Any other Error is reported to the failure manager as a critical error.
            failureManager.process(new FailureContext(CRITICAL_ERROR, e));
        }
    }

    /**
     * Performs one liveness check: finds the workers whose heartbeat lags more than allowed and, for each such thread for which the JVM
     * can still provide information, reports a {@code SYSTEM_WORKER_BLOCKED} failure containing the thread's state and stack trace.
     */
    private void doProbeLiveness() {
        long maxAllowedLag = configuration.maxAllowedLagMillis().value();

        Long2LongMap delayedThreadIdsToDelays = getDelayedThreadIdsAndDelays(maxAllowedLag);

        if (delayedThreadIdsToDelays == null) {
            // No worker lags more than allowed.
            return;
        }

        ThreadInfo[] threadInfos = threadMxBean.getThreadInfo(
                delayedThreadIdsToDelays.keySet().toLongArray(),
                threadMxBean.isObjectMonitorUsageSupported(),
                threadMxBean.isSynchronizerUsageSupported());

        for (ThreadInfo threadInfo : threadInfos) {
            // ThreadMXBean returns a null element for a thread ID that does not correspond to a live thread.
            if (threadInfo != null) {
                StringBuilder message = new StringBuilder()
                        .append("A critical thread is blocked for ")
                        .append(delayedThreadIdsToDelays.get(threadInfo.getThreadId()))
                        .append(" ms that is more than the allowed ")
                        .append(maxAllowedLag)
                        .append(" ms (defined at ignite.system.criticalWorkers.maxAllowedLagMillis local config property), it is ");

                appendThreadInfo(message, threadInfo);

                failureManager.process(new FailureContext(SYSTEM_WORKER_BLOCKED, null, message.toString()));
            }
        }
    }

    /**
     * Collects the registered workers whose heartbeat is older than the allowed lag. Workers reporting
     * {@link CriticalWorker#NOT_MONITORED} are skipped.
     *
     * @param maxAllowedLag Maximum allowed lag, in milliseconds.
     * @return Mapping from thread ID to the observed delay in milliseconds, or {@code null} if no worker exceeds the allowed lag
     *     (the map is only allocated when at least one lagging worker is found).
     */
    @Nullable
    private Long2LongMap getDelayedThreadIdsAndDelays(long maxAllowedLag) {
        long nowNanos = System.nanoTime();

        Long2LongMap delayedThreadIdsToDelays = null;

        for (CriticalWorker worker : registeredWorkers) {
            long heartbeatNanos = worker.heartbeatNanos();

            if (heartbeatNanos == CriticalWorker.NOT_MONITORED) {
                continue;
            }

            // System.nanoTime() values must be compared via their difference, never directly.
            long delayMillis = TimeUnit.NANOSECONDS.toMillis(nowNanos - heartbeatNanos);
            if (delayMillis > maxAllowedLag) {
                if (delayedThreadIdsToDelays == null) {
                    delayedThreadIdsToDelays = new Long2LongOpenHashMap();
                }

                delayedThreadIdsToDelays.put(worker.threadId(), delayMillis);
            }
        }

        return delayedThreadIdsToDelays;
    }

    /**
     * Appends a textual description of the thread (name, state, lock information and the complete stack trace) to the given builder.
     *
     * @param sb Builder to append to.
     * @param threadInfo Information about the thread to describe.
     */
    private static void appendThreadInfo(StringBuilder sb, ThreadInfo threadInfo) {
        // This method is based on code taken from ThreadInfo#toString(). The original method limits the depth of the
        // stacktrace it includes in the string representation to just 8 frames, which is too few. Here, we
        // removed this limitation and include the stack trace in its entirety.

        sb
                .append('\"').append(threadInfo.getThreadName()).append('\"')
                .append(threadInfo.isDaemon() ? " daemon" : "")
                .append(" prio=").append(threadInfo.getPriority())
                .append(" Id=").append(threadInfo.getThreadId()).append(' ')
                .append(threadInfo.getThreadState());

        if (threadInfo.getLockName() != null) {
            sb.append(" on ").append(threadInfo.getLockName());
        }
        if (threadInfo.getLockOwnerName() != null) {
            sb.append(" owned by \"").append(threadInfo.getLockOwnerName())
                    .append("\" Id=").append(threadInfo.getLockOwnerId());
        }
        if (threadInfo.isSuspended()) {
            sb.append(" (suspended)");
        }
        if (threadInfo.isInNative()) {
            sb.append(" (in native)");
        }
        sb.append('\n');

        // ThreadInfo hands out a fresh copy of its arrays on every getter call, so they are fetched once here
        // instead of on every iteration of the loop below.
        StackTraceElement[] stackTrace = threadInfo.getStackTrace();
        MonitorInfo[] lockedMonitors = threadInfo.getLockedMonitors();
        LockInfo lockInfo = threadInfo.getLockInfo();

        for (int i = 0; i < stackTrace.length; i++) {
            sb.append("\tat ").append(stackTrace[i].toString()).append('\n');

            // The lock the thread is blocked/waiting on is reported right after the topmost frame.
            if (i == 0 && lockInfo != null) {
                Thread.State ts = threadInfo.getThreadState();
                switch (ts) {
                    case BLOCKED:
                        sb.append("\t-  blocked on ").append(lockInfo).append('\n');
                        break;
                    case WAITING:
                    case TIMED_WAITING:
                        sb.append("\t-  waiting on ").append(lockInfo).append('\n');
                        break;
                    default:
                }
            }

            // Monitors are listed under the frame in which they were locked.
            for (MonitorInfo mi : lockedMonitors) {
                if (mi.getLockedStackDepth() == i) {
                    sb.append("\t-  locked ").append(mi).append('\n');
                }
            }
        }

        LockInfo[] locks = threadInfo.getLockedSynchronizers();
        if (locks.length > 0) {
            sb.append("\n\tNumber of locked synchronizers = ").append(locks.length).append('\n');
            for (LockInfo li : locks) {
                sb.append("\t- ").append(li).append('\n');
            }
        }
        sb.append('\n');
    }

    /**
     * Cancels the periodic liveness probe, if it was scheduled. A probe that is already running is not interrupted.
     *
     * @param componentContext Component context (not used by this component).
     * @return An already completed future.
     */
    @Override
    public CompletableFuture<Void> stopAsync(ComponentContext componentContext) {
        // Read the volatile field once so that the null check and the call operate on the same reference.
        ScheduledFuture<?> taskFuture = livenessProbeTaskFuture;
        if (taskFuture != null) {
            taskFuture.cancel(false);
        }
        return nullCompletedFuture();
    }
}
