Anonymous
×
Create a new article
Write your page title here:
We currently have 9 articles on NixSec. Type your article name above or click on one of the titles below and start writing!



    NixSec
    Revision as of 12:35, 4 December 2020 by DJ-ArcAngel (talk | contribs)
    (diff) ← Older revision | Latest revision (diff) | Newer revision → (diff)

    Slurm Monitoring

    This manual is for CentOS 7.

    This howto assumes you already have a node up and running with slurm installed on it and the correct munge key.

    slurmd does not have to be running, but it needs access to the slurm commands like squeue, sinfo, etc.

    Grafana

    Add the grafana repository /etc/yum.repos.d/grafana.repo

    [grafana]
    name=grafana
    baseurl=https://packages.grafana.com/oss/rpm
    repo_gpgcheck=1
    enabled=1
    gpgcheck=1
    gpgkey=https://packages.grafana.com/gpg.key
    sslverify=1
    sslcacert=/etc/pki/tls/certs/ca-bundle.crt
    

    Install grafana

    # yum -y install grafana
     
    # systemctl enable --now grafana-server
    

    Open firewall

    # firewall-cmd --add-port=3000/tcp --permanent
    # firewall-cmd --reload
    

    Once the service has been started, you can access its web dashboard by visiting http://[serverip|hostname]:3000.


    Prometheus Slurm Exporter

    Install dependencies

    # yum install go
    

    Clone prometheus slurm exporter from git

    # git clone https://github.com/vpenso/prometheus-slurm-exporter.git
    
     
    # cd prometheus-slurm-exporter
    # go mod download
    # go build -o bin/prometheus-slurm-exporter {main,accounts,cpus,nodes,partitions,queue,scheduler,users}.go
     
    # cp bin/prometheus-slurm-exporter /usr/local/bin
    


    Create systemd file /etc/systemd/system/prometheus-slurm.service

    [Unit]
    Description=Prometheus-Slurm
    Documentation=https://prometheus.io/docs/introduction/overview/
    Wants=network-online.target
    After=network-online.target
     
    [Service]
    Type=notify
    Environment="GOMAXPROCS=2"
    User=prometheus
    Group=prometheus
    ExecReload=/bin/kill -HUP $MAINPID
    ExecStart=/usr/local/bin/prometheus-slurm-exporter
     
    SyslogIdentifier=prometheus-slurm
    Restart=always
     
    [Install]
    WantedBy=multi-user.target
    

    Prometheus Framework

    Create user and group

    # groupadd --system prometheus
    # useradd -s /sbin/nologin --system -g prometheus prometheus
    

    Create directories

    # mkdir /var/lib/prometheus
    # for i in rules rules.d files_sd; do sudo mkdir -p /etc/prometheus/${i}; done
    

    Download prometheus

    # yum -y install wget
     
    # mkdir -p /tmp/prometheus && cd /tmp/prometheus
    # curl -s https://api.github.com/repos/prometheus/prometheus/releases/latest \
        | grep browser_download_url \
        | grep linux-amd64 \
        | cut -d '"' -f 4 \
        | wget -qi -
    

    Extract the file

    # tar xvf prometheus*.tar.gz
    # cd prometheus*/
     
    # mv prometheus promtool /usr/local/bin/
    # mv prometheus.yml  /etc/prometheus/prometheus.yml
    # mv consoles/ console_libraries/ /etc/prometheus/
    


    Create the systemd file /etc/systemd/system/prometheus.service

    [Unit]
    Description=Prometheus
    Documentation=https://prometheus.io/docs/introduction/overview/
    Wants=network-online.target
    After=network-online.target
     
    [Service]
    Type=simple
    Environment="GOMAXPROCS=1"
    User=prometheus
    Group=prometheus
    ExecReload=/bin/kill -HUP $MAINPID
    ExecStart=/usr/local/bin/prometheus \
      --config.file=/etc/prometheus/prometheus.yml \
      --storage.tsdb.path=/var/lib/prometheus \
      --web.console.templates=/etc/prometheus/consoles \
      --web.console.libraries=/etc/prometheus/console_libraries \
      --web.listen-address=0.0.0.0:9090 \
      --web.external-url=
     
    SyslogIdentifier=prometheus
    Restart=always
     
    [Install]
    WantedBy=multi-user.target
    


    Change directory permissions

    # for i in rules rules.d files_sd; do sudo chown -R prometheus:prometheus /etc/prometheus/${i}; done
    # for i in rules rules.d files_sd; do sudo chmod -R 775 /etc/prometheus/${i}; done
    # chown -R prometheus:prometheus /var/lib/prometheus/
    

    Prometheus config /etc/prometheus/prometheus.yml

    # my global config
    global:
      scrape_interval:     30s # Set the scrape interval to every 30 seconds. Default is every 1 minute.
      evaluation_interval: 30s # Evaluate rules every 30 seconds. The default is every 1 minute.
      # scrape_timeout is set to the global default (10s).
     
    # Alertmanager configuration
    alerting:
      alertmanagers:
      - static_configs:
        - targets:
          # - alertmanager:9093
     
    # Load rules once and periodically evaluate them according to the global 'evaluation_interval'.
    rule_files:
      # - "first_rules.yml"
      # - "second_rules.yml"
     
    # A scrape configuration containing exactly one endpoint to scrape:
    # Here it's Prometheus itself.
    scrape_configs:
      # The job name is added as a label `job=<job_name>` to any timeseries scraped from this config.
      - job_name: 'prometheus'
     
        # metrics_path defaults to '/metrics'
        # scheme defaults to 'http'.
     
        static_configs:
        - targets: ['0.0.0.0:9090']
     
      - job_name: 'slurm_exporter'
        scrape_interval: 5s
        static_configs:
          - targets: ['localhost:8080']
     
      - job_name: 'grafana'
        scrape_interval: 5s
        metrics_path: /grafana/metrics
        static_configs:
          - targets: ['localhost:80']
    


    Reload and start daemon service

    # systemctl daemon-reload
    # systemctl enable --now prometheus
    # systemctl enable --now prometheus-slurm
    


    Configure the firewall

    # firewall-cmd --add-port=9090/tcp --permanent
    # firewall-cmd --reload
    


    Install nginx

    This is needed because you will otherwise get a CORS (Cross-Origin Resource Sharing) error, and Grafana does not have a setting for this itself.

    # yum install nginx
    


    Nginx conf /etc/nginx/conf.d/grafana.conf

    server {
        listen       8008;
        root         /var/www/html;
        access_log  /var/log/nginx/access.log main;
     
     
        location / {
            add_header 'Access-Control-Allow-Origin' '*';
            proxy_pass  http://127.0.0.1:9090;
        }
    }
    


    Enable nginx

    # systemctl enable --now nginx
    


    Add grafana slurm dashboard

    Go to Data Sources and add a Prometheus data source.

    Then import the Slurm dashboard.

    You are all done and can now view your HPC cluster at http://<YOURIP>:3000