CDMP Application DB

CDMP Application DB Monitoring

We have prepared a zabbix host record for monitoring db activity of several CDMP services. This host definition have the sections below:

  • Audience Monitoring
  • Automotion Monitoring
  • Database
  • Data Export Monitoring
  • Data Import Monitoring
  • Delivery Monitoring
  • Jobs Monitoring
  • Send Monitoring

To use this host definition, please import the xml below

cdmp-app-host.xml

<?xml version="1.0" encoding="UTF-8"?>
<zabbix_export>
    <version>5.0</version>
    <date>2019-06-21T10:18:33Z</date>
    <groups>
        <group>
            <name>ALERT/CDMP Applications</name>
        </group>
        <group>
            <name>ALERT/Global</name>
        </group>
        <group>
            <name>CDMP/Applications</name>
        </group>
        <group>
            <name>DB App Monitoring</name>
        </group>
    </groups>
    <hosts>
        <host>
            <host>CDMP App Related DB Monitoring</host>
            <name>CDMP App Related DB Monitoring</name>
            <groups>
                <group>
                    <name>ALERT/CDMP Applications</name>
                </group>
                <group>
                    <name>ALERT/Global</name>
                </group>
                <group>
                    <name>CDMP/Applications</name>
                </group>
                <group>
                    <name>DB App Monitoring</name>
                </group>
            </groups>
            <interfaces>
                <interface>
                    <ip>127.0.0.1</ip>
                    <interface_ref>if1</interface_ref>
                </interface>
            </interfaces>
            <applications>
                <application>
                    <name>Audience Monitoring</name>
                </application>
                <application>
                    <name>Automotion Monitoring</name>
                </application>
                <application>
                    <name>Database</name>
                </application>
                <application>
                    <name>Data Export Monitoring</name>
                </application>
                <application>
                    <name>Data Import Monitoring</name>
                </application>
                <application>
                    <name>Delivery Monitoring</name>
                </application>
                <application>
                    <name>Jobs Monitoring</name>
                </application>
                <application>
                    <name>Send Monitoring</name>
                </application>
            </applications>
            <items>
                <item>
                    <name>Audience Monitoring - LONG RUNNING</name>
                    <type>ODBC</type>
                    <key>db.odbc.select[&lt;audience-monitoring-long-running&gt;,pgres-cluster]</key>
                    <history>60d</history>
                    <trends>60d</trends>
                    <params>select count(*)&#13;
from sys.automation_audience_order&#13;
where status = 'PROCESSING' and process_start &lt; current_timestamp - (interval '20 minute')</params>
                    <applications>
                        <application>
                            <name>Audience Monitoring</name>
                        </application>
                        <application>
                            <name>Database</name>
                        </application>
                    </applications>
                    <triggers>
                        <trigger>
                            <expression>{last(#2)}&gt;0</expression>
                            <recovery_mode>RECOVERY_EXPRESSION</recovery_mode>
                            <recovery_expression>{last(#1)}=0</recovery_expression>
                            <name>Audience Monitoring - LONG-RUNNING</name>
                            <priority>DISASTER</priority>
                        </trigger>
                    </triggers>
                </item>
                <item>
                    <name>Automotion Monitoring ERROR</name>
                    <type>ODBC</type>
                    <key>db.odbc.select[&lt;automotion-monitoring-error&gt;,pgres-cluster]</key>
                    <history>60d</history>
                    <trends>60d</trends>
                    <params>select count(*) from sys.automated_job_instance where execution_time between public.now_acc(account_id) - (interval '1 hour') and public.now_acc(account_id) and status in ('ERROR')</params>
                    <applications>
                        <application>
                            <name>Automotion Monitoring</name>
                        </application>
                        <application>
                            <name>Database</name>
                        </application>
                    </applications>
                    <triggers>
                        <trigger>
                            <expression>{last(#2)}&gt;0</expression>
                            <recovery_mode>RECOVERY_EXPRESSION</recovery_mode>
                            <recovery_expression>{last(#1)}=0</recovery_expression>
                            <name>Automotion Monitoring - ERROR</name>
                            <priority>DISASTER</priority>
                        </trigger>
                    </triggers>
                </item>
                <item>
                    <name>Automotion Monitoring LATE</name>
                    <type>ODBC</type>
                    <key>db.odbc.select[&lt;automotion-monitoring-late&gt;,pgres-cluster]</key>
                    <history>60d</history>
                    <trends>60d</trends>
                    <params>select count(*) from sys.automated_job_instance where (public.now_acc(account_id) - execution_time)::interval &gt; (interval '5 minute') and (start_time is null or status = 'READY')</params>
                    <applications>
                        <application>
                            <name>Automotion Monitoring</name>
                        </application>
                        <application>
                            <name>Database</name>
                        </application>
                    </applications>
                    <triggers>
                        <trigger>
                            <expression>{last(#2)}&gt;0</expression>
                            <recovery_mode>RECOVERY_EXPRESSION</recovery_mode>
                            <recovery_expression>{last(#1)}=0</recovery_expression>
                            <name>Automotion Monitoring - LATE</name>
                            <priority>DISASTER</priority>
                        </trigger>
                    </triggers>
                </item>
                <item>
                    <name>Automotion Monitoring LONG-RUNNING</name>
                    <type>ODBC</type>
                    <key>db.odbc.select[&lt;automotion-monitoring-long-running&gt;,pgres-cluster]</key>
                    <history>60d</history>
                    <trends>60d</trends>
                    <params>select count(*)&#13;
from sys.automated_job_instance as t1&#13;
        inner join sys.automated_job_action_wait_queue as t2 on t1.account_id = t2.account_id and t1.instance_id = t2.instance_id&#13;
where t1.status = 'RUNNING'&#13;
  and (public.now_acc(t1.account_id) - start_time)::interval &gt; (interval '24 hour')&#13;
  and t2.type = 'WAIT'&#13;
  and t2.status &lt;&gt; 'COMPLETED'&#13;
  and t2.wait_until is not null&#13;
  and t2.wait_until &lt; current_timestamp;</params>
                    <applications>
                        <application>
                            <name>Automotion Monitoring</name>
                        </application>
                        <application>
                            <name>Database</name>
                        </application>
                    </applications>
                    <triggers>
                        <trigger>
                            <expression>{last(#2)}&gt;0</expression>
                            <recovery_mode>RECOVERY_EXPRESSION</recovery_mode>
                            <recovery_expression>{last(#1)}=0</recovery_expression>
                            <name>Automotion Monitoring - LONG-RUNNING</name>
                            <priority>DISASTER</priority>
                        </trigger>
                    </triggers>
                </item>
                <item>
                    <name>Data Export Monitoring ERROR</name>
                    <type>ODBC</type>
                    <key>db.odbc.select[&lt;data-export-monitoring-error&gt;,pgres-cluster]</key>
                    <history>60d</history>
                    <trends>60d</trends>
                    <params>select count(*) from sys.data_export_order where process_start between public.now_acc(account_id) - (interval '1 hour') and public.now_acc(account_id)  and status in ('FAIL')</params>
                    <applications>
                        <application>
                            <name>Database</name>
                        </application>
                        <application>
                            <name>Data Export Monitoring</name>
                        </application>
                    </applications>
                    <triggers>
                        <trigger>
                            <expression>{last(#2)}&gt;0</expression>
                            <recovery_mode>RECOVERY_EXPRESSION</recovery_mode>
                            <recovery_expression>{last(#1)}=0</recovery_expression>
                            <name>Data Export Monitoring - ERROR</name>
                            <priority>DISASTER</priority>
                        </trigger>
                    </triggers>
                </item>
                <item>
                    <name>Data Export Monitoring LATE</name>
                    <type>ODBC</type>
                    <key>db.odbc.select[&lt;data-export-monitoring-late&gt;,pgres-cluster]</key>
                    <history>60d</history>
                    <trends>60d</trends>
                    <params>select count(*) from sys.data_export_order where process_start is null and (public.now_acc(account_id) - created_at)::interval &gt; (interval '5 minute') and status in ('NEW')</params>
                    <applications>
                        <application>
                            <name>Database</name>
                        </application>
                        <application>
                            <name>Data Export Monitoring</name>
                        </application>
                    </applications>
                    <triggers>
                        <trigger>
                            <expression>{last(#2)}&gt;0</expression>
                            <recovery_mode>RECOVERY_EXPRESSION</recovery_mode>
                            <recovery_expression>{last(#1)}=0</recovery_expression>
                            <name>Data Export Monitoring - LATE</name>
                            <priority>DISASTER</priority>
                        </trigger>
                    </triggers>
                </item>
                <item>
                    <name>Data Export Monitoring LONG-RUNNING</name>
                    <type>ODBC</type>
                    <key>db.odbc.select[&lt;data-export-monitoring-long-running&gt;,pgres-cluster]</key>
                    <history>60d</history>
                    <trends>60d</trends>
                    <params>select count(*) from sys.data_export_order where status = 'PROCESSING' and (public.now_acc(account_id) - process_start)::interval &gt; (interval '20 minute')</params>
                    <applications>
                        <application>
                            <name>Database</name>
                        </application>
                        <application>
                            <name>Data Export Monitoring</name>
                        </application>
                    </applications>
                    <triggers>
                        <trigger>
                            <expression>{last(#2)}&gt;0</expression>
                            <recovery_mode>RECOVERY_EXPRESSION</recovery_mode>
                            <recovery_expression>{last(#1)}=0</recovery_expression>
                            <name>Data Export Monitoring - LONG-RUNNING</name>
                            <priority>DISASTER</priority>
                        </trigger>
                    </triggers>
                </item>
                <item>
                    <name>Data Import Monitoring ERROR</name>
                    <type>ODBC</type>
                    <key>db.odbc.select[&lt;data-import-monitoring-error&gt;,pgres-cluster]</key>
                    <history>60d</history>
                    <trends>60d</trends>
                    <params>select count(*) from sys.data_upload_order where process_start between public.now_acc(account_id) - (interval '1 hour') and public.now_acc(account_id) and status in ('ERROR')</params>
                    <applications>
                        <application>
                            <name>Database</name>
                        </application>
                        <application>
                            <name>Data Import Monitoring</name>
                        </application>
                    </applications>
                    <triggers>
                        <trigger>
                            <expression>{last(#2)}&gt;0</expression>
                            <recovery_mode>RECOVERY_EXPRESSION</recovery_mode>
                            <recovery_expression>{last(#1)}=0</recovery_expression>
                            <name>Data Import Monitoring - ERROR</name>
                            <priority>DISASTER</priority>
                        </trigger>
                    </triggers>
                </item>
                <item>
                    <name>Data Import Monitoring LATE</name>
                    <type>ODBC</type>
                    <key>db.odbc.select[&lt;data-import-monitoring-late&gt;,pgres-cluster]</key>
                    <history>60d</history>
                    <trends>60d</trends>
                    <params>select count(*) from sys.data_upload_order where process_start is null and (public.now_acc(account_id) - created_at)::interval &gt; (interval '5 minute') and status in ('NEW') and account_id not in (select account_id from sys.data_upload_order where status = 'PROCESSING')</params>
                    <applications>
                        <application>
                            <name>Database</name>
                        </application>
                        <application>
                            <name>Data Import Monitoring</name>
                        </application>
                    </applications>
                    <triggers>
                        <trigger>
                            <expression>{last(#2)}&gt;0</expression>
                            <recovery_mode>RECOVERY_EXPRESSION</recovery_mode>
                            <recovery_expression>{last(#1)}=0</recovery_expression>
                            <name>Data Import Monitoring - LATE</name>
                            <priority>DISASTER</priority>
                        </trigger>
                    </triggers>
                </item>
                <item>
                    <name>Data Import Monitoring LONG-RUNNING</name>
                    <type>ODBC</type>
                    <key>db.odbc.select[&lt;data-import-monitoring-long-running&gt;,pgres-cluster]</key>
                    <history>60d</history>
                    <trends>60d</trends>
                    <params>select count(*) from sys.data_upload_order where status = 'PROCESSING' and (public.now_acc(account_id) - process_start)::interval &gt; (interval '60 minute')</params>
                    <applications>
                        <application>
                            <name>Database</name>
                        </application>
                        <application>
                            <name>Data Import Monitoring</name>
                        </application>
                    </applications>
                    <triggers>
                        <trigger>
                            <expression>{last(#2)}&gt;0</expression>
                            <recovery_mode>RECOVERY_EXPRESSION</recovery_mode>
                            <recovery_expression>{last(#1)}=0</recovery_expression>
                            <name>Data Import Monitoring - LONG-RUNNING</name>
                            <priority>DISASTER</priority>
                        </trigger>
                    </triggers>
                </item>
                <item>
                    <name>Delivery Monitoring WARN</name>
                    <type>ODBC</type>
                    <key>db.odbc.select[&lt;delivery-monitoring-warn&gt;,pgres-cluster]</key>
                    <history>60d</history>
                    <trends>60d</trends>
                    <params>select coalesce(count(*),0)&#13;
from sys.email_send_order&#13;
where delivery_date between now() - interval '60 mins' and now()&#13;
and error_count &gt; relayed_count * 0.30</params>
                    <applications>
                        <application>
                            <name>Database</name>
                        </application>
                        <application>
                            <name>Delivery Monitoring</name>
                        </application>
                    </applications>
                    <triggers>
                        <trigger>
                            <expression>{last(#2)}&gt;0</expression>
                            <recovery_mode>RECOVERY_EXPRESSION</recovery_mode>
                            <recovery_expression>{last(#1)}=0</recovery_expression>
                            <name>Application Delivery Monitoring - WARN</name>
                            <priority>DISASTER</priority>
                        </trigger>
                    </triggers>
                </item>
                <item>
                    <name>Jobs Monitoring LONG-RUNNING</name>
                    <type>ODBC</type>
                    <key>db.odbc.select[&lt;jobs-monitoring-long-running&gt;,pgres-cluster]</key>
                    <history>60d</history>
                    <trends>60d</trends>
                    <status>DISABLED</status>
                    <params>select count(*)&#13;
from (&#13;
select (DATE_PART('day', coalesce(end_time,current_timestamp) - start_time) * 24 * 60 + &#13;
DATE_PART('hour', coalesce(end_time,current_timestamp) - start_time)) * 60 +&#13;
DATE_PART('minute',coalesce(end_time,current_timestamp) - start_time) as job_duration,&#13;
jobid,start_time,end_time,command &#13;
from cron.job_run_details &#13;
where date(start_time) = current_date&#13;
) x&#13;
where x.job_duration &gt; 5;</params>
                    <applications>
                        <application>
                            <name>Database</name>
                        </application>
                        <application>
                            <name>Jobs Monitoring</name>
                        </application>
                    </applications>
                </item>
                <item>
                    <name>Push Send Monitoring LATE</name>
                    <type>ODBC</type>
                    <key>db.odbc.select[&lt;push-send-monitoring-late&gt;,pgres-cluster]</key>
                    <params>select count(*)&#13;
from sys.push_send_order&#13;
where (public.now_acc(account_id) - delivery_date)::interval &gt; (interval '5 minute')&#13;
  and status in ('READY')</params>
                    <applications>
                        <application>
                            <name>Database</name>
                        </application>
                        <application>
                            <name>Send Monitoring</name>
                        </application>
                    </applications>
                </item>
                <item>
                    <name>Send Monitoring ERROR</name>
                    <type>ODBC</type>
                    <key>db.odbc.select[&lt;send-monitoring-error&gt;,pgres-cluster]</key>
                    <history>60d</history>
                    <trends>60d</trends>
                    <params>select count(*) from sys.email_send_order where delivery_date between public.now_acc(account_id) - (interval '1 hour') and public.now_acc(account_id) and status in ('FINAL_ERROR')</params>
                    <applications>
                        <application>
                            <name>Database</name>
                        </application>
                        <application>
                            <name>Send Monitoring</name>
                        </application>
                    </applications>
                    <triggers>
                        <trigger>
                            <expression>{last(#2)}&gt;0</expression>
                            <recovery_mode>RECOVERY_EXPRESSION</recovery_mode>
                            <recovery_expression>{last(#1)}=0</recovery_expression>
                            <name>Application Send Monitoring - ERROR</name>
                            <priority>DISASTER</priority>
                        </trigger>
                    </triggers>
                </item>
                <item>
                    <name>Send Monitoring INCOMPLETE</name>
                    <type>ODBC</type>
                    <key>db.odbc.select[&lt;send-monitoring-incomplete&gt;,pgres-cluster]</key>
                    <history>60d</history>
                    <trends>60d</trends>
                    <params>select count(*)&#13;
from sys.email_send_order&#13;
where (public.now_acc(account_id) - delivery_date)::interval &gt; (interval '16 hour')&#13;
and speed = 1&#13;
and status not in ('COMPLETED', 'CANCELLED','FINAL_ERROR');</params>
                    <applications>
                        <application>
                            <name>Database</name>
                        </application>
                        <application>
                            <name>Send Monitoring</name>
                        </application>
                    </applications>
                    <triggers>
                        <trigger>
                            <expression>{last(#2)}&gt;0</expression>
                            <recovery_mode>RECOVERY_EXPRESSION</recovery_mode>
                            <recovery_expression>{last(#1)}=0</recovery_expression>
                            <name>Application Send Monitoring - INCOMPLETE</name>
                            <priority>DISASTER</priority>
                        </trigger>
                    </triggers>
                </item>
                <item>
                    <name>Send Monitoring LATE</name>
                    <type>ODBC</type>
                    <key>db.odbc.select[&lt;send-monitoring-late&gt;,pgres-cluster]</key>
                    <history>60d</history>
                    <trends>60d</trends>
                    <params>select count(*) from sys.email_send_order where (public.now_acc(account_id) - delivery_date)::interval &gt; (interval '5 minute') and status in ('READY')</params>
                    <applications>
                        <application>
                            <name>Database</name>
                        </application>
                        <application>
                            <name>Send Monitoring</name>
                        </application>
                    </applications>
                    <triggers>
                        <trigger>
                            <expression>{last(#2)}&gt;0</expression>
                            <recovery_mode>RECOVERY_EXPRESSION</recovery_mode>
                            <recovery_expression>{last(#1)}=0</recovery_expression>
                            <name>Application Send Monitoring - LATE</name>
                            <priority>DISASTER</priority>
                        </trigger>
                    </triggers>
                </item>
                <item>
                    <name>Send Monitoring LONG-RUNNING</name>
                    <type>ODBC</type>
                    <key>db.odbc.select[&lt;send-monitoring-long-running&gt;,pgres-cluster]</key>
                    <history>60d</history>
                    <trends>60d</trends>
                    <params>select sum(&quot;count&quot;) as send_count&#13;
from(&#13;
select count(*) as &quot;count&quot;&#13;
from sys.email_send_order&#13;
where status in ('PREPARING')&#13;
  and (public.now_acc(account_id) - delivery_date)::interval &gt; (interval '20 minute')&#13;
union&#13;
select count(*) as &quot;count&quot;&#13;
from sys.email_send_order&#13;
where status in ('PREPARED')&#13;
  and (public.now_acc(account_id) - updated_at)::interval &gt; (interval '30 minute')&#13;
union&#13;
select count(*) as &quot;count&quot;&#13;
from sys.email_send_order&#13;
where status ='PREPARED'&#13;
    and relayed_count = 0&#13;
  and (public.now_acc(account_id) - updated_at)::interval &gt; (interval '5 minute')&#13;
    )as A;</params>
                    <applications>
                        <application>
                            <name>Database</name>
                        </application>
                        <application>
                            <name>Send Monitoring</name>
                        </application>
                    </applications>
                    <triggers>
                        <trigger>
                            <expression>{last(#2)}&gt;0</expression>
                            <recovery_mode>RECOVERY_EXPRESSION</recovery_mode>
                            <recovery_expression>{last(#1)}=0</recovery_expression>
                            <name>Application Send Monitoring - LONG-RUNNING</name>
                            <priority>DISASTER</priority>
                        </trigger>
                    </triggers>
                </item>
            </items>
            <inventory_mode>DISABLED</inventory_mode>
        </host>
    </hosts>
</zabbix_export>

You have to create a zabbix_user on you postgresql first. Then yo have to give a select permission to related tables. On onpremise setups this user should be already created on deployment time.

On zabbix-server, please append the content below to your /etc/odbc.ini file.

[pgres-cluster]
Description = PostgreSQL database cluster dengage db
Driver  = postgresql
Username = zabbix_user
Password = {zabbix_user_password_here}
Servername = {postgresql_cluster_vip_here}
Port = {postgresql_cluster_port_here_5001}
Database = dengage
ReadOnly = Yes
Protocol = 7.4+
ShowOidColumn = No
FakeOidIndex  = No
RowVersioning = No
ShowSystemTables = No
Fetch = Yes
BoolsAsChar = Yes
ConnSettings =

[pgres-cluster-postgres]
Description = PostgreSQL database cluster dengage db
Driver  = postgresql
Username = zabbix_user
Password = {zabbix_user_password_here}
Servername = {postgresql_cluster_vip_here}
Port = {postgresql_cluster_port_here_5001}
Database = postgres
ReadOnly = Yes
Protocol = 7.4+
ShowOidColumn = No
FakeOidIndex  = No
RowVersioning = No
ShowSystemTables = No
Fetch = Yes
BoolsAsChar = Yes
ConnSettings =

[pgres-cluster-dcloud]
Description = PostgreSQL database cluster - dcloud db
Driver  = postgresql
Username = zabbix_user
Password = {zabbix_user_password_here}
Servername = {postgresql_cluster_vip_here}
Port = {postgresql_cluster_port_here_5001}
Database = dcloud
ReadOnly = Yes
Protocol = 7.4+
ShowOidColumn = No
FakeOidIndex  = No
RowVersioning = No
ShowSystemTables = No
Fetch = Yes
BoolsAsChar = Yes
ConnSettings =

and make sure the [PostgreSQL] section exists on /etc/odbcinst.ini

[PostgreSQL]
Description = General ODBC for PostgreSQL
Driver      = /usr/pgsql-12/lib/psqlodbcw.so
Setup       = /usr/lib64/libodbcpsqlS.so
FileUsage   = 1
Threading   = 2