CDMP Application DB Monitoring
We have prepared a zabbix host record for monitoring db activity of several CDMP services. This host definition have the sections below:
- Audience Monitoring
- Automotion Monitoring
- Database
- Data Export Monitoring
- Data Import Monitoring
- Delivery Monitoring
- Jobs Monitoring
- Send Monitoring
To use this host definition, please import the xml below
cdmp-app-host.xml
<?xml version="1.0" encoding="UTF-8"?>
<zabbix_export>
<version>5.0</version>
<date>2019-06-21T10:18:33Z</date>
<groups>
<group>
<name>ALERT/CDMP Applications</name>
</group>
<group>
<name>ALERT/Global</name>
</group>
<group>
<name>CDMP/Applications</name>
</group>
<group>
<name>DB App Monitoring</name>
</group>
</groups>
<hosts>
<host>
<host>CDMP App Related DB Monitoring</host>
<name>CDMP App Related DB Monitoring</name>
<groups>
<group>
<name>ALERT/CDMP Applications</name>
</group>
<group>
<name>ALERT/Global</name>
</group>
<group>
<name>CDMP/Applications</name>
</group>
<group>
<name>DB App Monitoring</name>
</group>
</groups>
<interfaces>
<interface>
<ip>127.0.0.1</ip>
<interface_ref>if1</interface_ref>
</interface>
</interfaces>
<applications>
<application>
<name>Audience Monitoring</name>
</application>
<application>
<name>Automotion Monitoring</name>
</application>
<application>
<name>Database</name>
</application>
<application>
<name>Data Export Monitoring</name>
</application>
<application>
<name>Data Import Monitoring</name>
</application>
<application>
<name>Delivery Monitoring</name>
</application>
<application>
<name>Jobs Monitoring</name>
</application>
<application>
<name>Send Monitoring</name>
</application>
</applications>
<items>
<item>
<name>Audience Monitoring - LONG RUNNING</name>
<type>ODBC</type>
<key>db.odbc.select[<audience-monitoring-long-running>,pgres-cluster]</key>
<history>60d</history>
<trends>60d</trends>
<params>select count(*)
from sys.automation_audience_order
where status = 'PROCESSING' and process_start < current_timestamp - (interval '20 minute')</params>
<applications>
<application>
<name>Audience Monitoring</name>
</application>
<application>
<name>Database</name>
</application>
</applications>
<triggers>
<trigger>
<expression>{last(#2)}>0</expression>
<recovery_mode>RECOVERY_EXPRESSION</recovery_mode>
<recovery_expression>{last(#1)}=0</recovery_expression>
<name>Audience Monitoring - LONG-RUNNING</name>
<priority>DISASTER</priority>
</trigger>
</triggers>
</item>
<item>
<name>Automotion Monitoring ERROR</name>
<type>ODBC</type>
<key>db.odbc.select[<automotion-monitoring-error>,pgres-cluster]</key>
<history>60d</history>
<trends>60d</trends>
<params>select count(*) from sys.automated_job_instance where execution_time between public.now_acc(account_id) - (interval '1 hour') and public.now_acc(account_id) and status in ('ERROR')</params>
<applications>
<application>
<name>Automotion Monitoring</name>
</application>
<application>
<name>Database</name>
</application>
</applications>
<triggers>
<trigger>
<expression>{last(#2)}>0</expression>
<recovery_mode>RECOVERY_EXPRESSION</recovery_mode>
<recovery_expression>{last(#1)}=0</recovery_expression>
<name>Automotion Monitoring - ERROR</name>
<priority>DISASTER</priority>
</trigger>
</triggers>
</item>
<item>
<name>Automotion Monitoring LATE</name>
<type>ODBC</type>
<key>db.odbc.select[<automotion-monitoring-late>,pgres-cluster]</key>
<history>60d</history>
<trends>60d</trends>
<params>select count(*) from sys.automated_job_instance where (public.now_acc(account_id) - execution_time)::interval > (interval '5 minute') and (start_time is null or status = 'READY')</params>
<applications>
<application>
<name>Automotion Monitoring</name>
</application>
<application>
<name>Database</name>
</application>
</applications>
<triggers>
<trigger>
<expression>{last(#2)}>0</expression>
<recovery_mode>RECOVERY_EXPRESSION</recovery_mode>
<recovery_expression>{last(#1)}=0</recovery_expression>
<name>Automotion Monitoring - LATE</name>
<priority>DISASTER</priority>
</trigger>
</triggers>
</item>
<item>
<name>Automotion Monitoring LONG-RUNNING</name>
<type>ODBC</type>
<key>db.odbc.select[<automotion-monitoring-long-running>,pgres-cluster]</key>
<history>60d</history>
<trends>60d</trends>
<params>select count(*)
from sys.automated_job_instance as t1
inner join sys.automated_job_action_wait_queue as t2 on t1.account_id = t2.account_id and t1.instance_id = t2.instance_id
where t1.status = 'RUNNING'
and (public.now_acc(t1.account_id) - start_time)::interval > (interval '24 hour')
and t2.type = 'WAIT'
and t2.status <> 'COMPLETED'
and t2.wait_until is not null
and t2.wait_until < current_timestamp;</params>
<applications>
<application>
<name>Automotion Monitoring</name>
</application>
<application>
<name>Database</name>
</application>
</applications>
<triggers>
<trigger>
<expression>{last(#2)}>0</expression>
<recovery_mode>RECOVERY_EXPRESSION</recovery_mode>
<recovery_expression>{last(#1)}=0</recovery_expression>
<name>Automotion Monitoring - LONG-RUNNING</name>
<priority>DISASTER</priority>
</trigger>
</triggers>
</item>
<item>
<name>Data Export Monitoring ERROR</name>
<type>ODBC</type>
<key>db.odbc.select[<data-export-monitoring-error>,pgres-cluster]</key>
<history>60d</history>
<trends>60d</trends>
<params>select count(*) from sys.data_export_order where process_start between public.now_acc(account_id) - (interval '1 hour') and public.now_acc(account_id) and status in ('FAIL')</params>
<applications>
<application>
<name>Database</name>
</application>
<application>
<name>Data Export Monitoring</name>
</application>
</applications>
<triggers>
<trigger>
<expression>{last(#2)}>0</expression>
<recovery_mode>RECOVERY_EXPRESSION</recovery_mode>
<recovery_expression>{last(#1)}=0</recovery_expression>
<name>Data Export Monitoring - ERROR</name>
<priority>DISASTER</priority>
</trigger>
</triggers>
</item>
<item>
<name>Data Export Monitoring LATE</name>
<type>ODBC</type>
<key>db.odbc.select[<data-export-monitoring-late>,pgres-cluster]</key>
<history>60d</history>
<trends>60d</trends>
<params>select count(*) from sys.data_export_order where process_start is null and (public.now_acc(account_id) - created_at)::interval > (interval '5 minute') and status in ('NEW')</params>
<applications>
<application>
<name>Database</name>
</application>
<application>
<name>Data Export Monitoring</name>
</application>
</applications>
<triggers>
<trigger>
<expression>{last(#2)}>0</expression>
<recovery_mode>RECOVERY_EXPRESSION</recovery_mode>
<recovery_expression>{last(#1)}=0</recovery_expression>
<name>Data Export Monitoring - LATE</name>
<priority>DISASTER</priority>
</trigger>
</triggers>
</item>
<item>
<name>Data Export Monitoring LONG-RUNNING</name>
<type>ODBC</type>
<key>db.odbc.select[<data-export-monitoring-long-running>,pgres-cluster]</key>
<history>60d</history>
<trends>60d</trends>
<params>select count(*) from sys.data_export_order where status = 'PROCESSING' and (public.now_acc(account_id) - process_start)::interval > (interval '20 minute')</params>
<applications>
<application>
<name>Database</name>
</application>
<application>
<name>Data Export Monitoring</name>
</application>
</applications>
<triggers>
<trigger>
<expression>{last(#2)}>0</expression>
<recovery_mode>RECOVERY_EXPRESSION</recovery_mode>
<recovery_expression>{last(#1)}=0</recovery_expression>
<name>Data Export Monitoring - LONG-RUNNING</name>
<priority>DISASTER</priority>
</trigger>
</triggers>
</item>
<item>
<name>Data Import Monitoring ERROR</name>
<type>ODBC</type>
<key>db.odbc.select[<data-import-monitoring-error>,pgres-cluster]</key>
<history>60d</history>
<trends>60d</trends>
<params>select count(*) from sys.data_upload_order where process_start between public.now_acc(account_id) - (interval '1 hour') and public.now_acc(account_id) and status in ('ERROR')</params>
<applications>
<application>
<name>Database</name>
</application>
<application>
<name>Data Import Monitoring</name>
</application>
</applications>
<triggers>
<trigger>
<expression>{last(#2)}>0</expression>
<recovery_mode>RECOVERY_EXPRESSION</recovery_mode>
<recovery_expression>{last(#1)}=0</recovery_expression>
<name>Data Import Monitoring - ERROR</name>
<priority>DISASTER</priority>
</trigger>
</triggers>
</item>
<item>
<name>Data Import Monitoring LATE</name>
<type>ODBC</type>
<key>db.odbc.select[<data-import-monitoring-late>,pgres-cluster]</key>
<history>60d</history>
<trends>60d</trends>
<params>select count(*) from sys.data_upload_order where process_start is null and (public.now_acc(account_id) - created_at)::interval > (interval '5 minute') and status in ('NEW') and account_id not in (select account_id from sys.data_upload_order where status = 'PROCESSING')</params>
<applications>
<application>
<name>Database</name>
</application>
<application>
<name>Data Import Monitoring</name>
</application>
</applications>
<triggers>
<trigger>
<expression>{last(#2)}>0</expression>
<recovery_mode>RECOVERY_EXPRESSION</recovery_mode>
<recovery_expression>{last(#1)}=0</recovery_expression>
<name>Data Import Monitoring - LATE</name>
<priority>DISASTER</priority>
</trigger>
</triggers>
</item>
<item>
<name>Data Import Monitoring LONG-RUNNING</name>
<type>ODBC</type>
<key>db.odbc.select[<data-import-monitoring-long-running>,pgres-cluster]</key>
<history>60d</history>
<trends>60d</trends>
<params>select count(*) from sys.data_upload_order where status = 'PROCESSING' and (public.now_acc(account_id) - process_start)::interval > (interval '60 minute')</params>
<applications>
<application>
<name>Database</name>
</application>
<application>
<name>Data Import Monitoring</name>
</application>
</applications>
<triggers>
<trigger>
<expression>{last(#2)}>0</expression>
<recovery_mode>RECOVERY_EXPRESSION</recovery_mode>
<recovery_expression>{last(#1)}=0</recovery_expression>
<name>Data Import Monitoring - LONG-RUNNING</name>
<priority>DISASTER</priority>
</trigger>
</triggers>
</item>
<item>
<name>Delivery Monitoring WARN</name>
<type>ODBC</type>
<key>db.odbc.select[<delivery-monitoring-warn>,pgres-cluster]</key>
<history>60d</history>
<trends>60d</trends>
<params>select coalesce(count(*),0)
from sys.email_send_order
where delivery_date between now() - interval '60 mins' and now()
and error_count > relayed_count * 0.30</params>
<applications>
<application>
<name>Database</name>
</application>
<application>
<name>Delivery Monitoring</name>
</application>
</applications>
<triggers>
<trigger>
<expression>{last(#2)}>0</expression>
<recovery_mode>RECOVERY_EXPRESSION</recovery_mode>
<recovery_expression>{last(#1)}=0</recovery_expression>
<name>Application Delivery Monitoring - WARN</name>
<priority>DISASTER</priority>
</trigger>
</triggers>
</item>
<item>
<name>Jobs Monitoring LONG-RUNNING</name>
<type>ODBC</type>
<key>db.odbc.select[<jobs-monitoring-long-running>,pgres-cluster]</key>
<history>60d</history>
<trends>60d</trends>
<status>DISABLED</status>
<params>select count(*)
from (
select (DATE_PART('day', coalesce(end_time,current_timestamp) - start_time) * 24 * 60 +
DATE_PART('hour', coalesce(end_time,current_timestamp) - start_time)) * 60 +
DATE_PART('minute',coalesce(end_time,current_timestamp) - start_time) as job_duration,
jobid,start_time,end_time,command
from cron.job_run_details
where date(start_time) = current_date
) x
where x.job_duration > 5;</params>
<applications>
<application>
<name>Database</name>
</application>
<application>
<name>Jobs Monitoring</name>
</application>
</applications>
</item>
<item>
<name>Push Send Monitoring LATE</name>
<type>ODBC</type>
<key>db.odbc.select[<push-send-monitoring-late>,pgres-cluster]</key>
<params>select count(*)
from sys.push_send_order
where (public.now_acc(account_id) - delivery_date)::interval > (interval '5 minute')
and status in ('READY')</params>
<applications>
<application>
<name>Database</name>
</application>
<application>
<name>Send Monitoring</name>
</application>
</applications>
</item>
<item>
<name>Send Monitoring ERROR</name>
<type>ODBC</type>
<key>db.odbc.select[<send-monitoring-error>,pgres-cluster]</key>
<history>60d</history>
<trends>60d</trends>
<params>select count(*) from sys.email_send_order where delivery_date between public.now_acc(account_id) - (interval '1 hour') and public.now_acc(account_id) and status in ('FINAL_ERROR')</params>
<applications>
<application>
<name>Database</name>
</application>
<application>
<name>Send Monitoring</name>
</application>
</applications>
<triggers>
<trigger>
<expression>{last(#2)}>0</expression>
<recovery_mode>RECOVERY_EXPRESSION</recovery_mode>
<recovery_expression>{last(#1)}=0</recovery_expression>
<name>Application Send Monitoring - ERROR</name>
<priority>DISASTER</priority>
</trigger>
</triggers>
</item>
<item>
<name>Send Monitoring INCOMPLETE</name>
<type>ODBC</type>
<key>db.odbc.select[<send-monitoring-incomplete>,pgres-cluster]</key>
<history>60d</history>
<trends>60d</trends>
<params>select count(*)
from sys.email_send_order
where (public.now_acc(account_id) - delivery_date)::interval > (interval '16 hour')
and speed = 1
and status not in ('COMPLETED', 'CANCELLED','FINAL_ERROR');</params>
<applications>
<application>
<name>Database</name>
</application>
<application>
<name>Send Monitoring</name>
</application>
</applications>
<triggers>
<trigger>
<expression>{last(#2)}>0</expression>
<recovery_mode>RECOVERY_EXPRESSION</recovery_mode>
<recovery_expression>{last(#1)}=0</recovery_expression>
<name>Application Send Monitoring - INCOMPLETE</name>
<priority>DISASTER</priority>
</trigger>
</triggers>
</item>
<item>
<name>Send Monitoring LATE</name>
<type>ODBC</type>
<key>db.odbc.select[<send-monitoring-late>,pgres-cluster]</key>
<history>60d</history>
<trends>60d</trends>
<params>select count(*) from sys.email_send_order where (public.now_acc(account_id) - delivery_date)::interval > (interval '5 minute') and status in ('READY')</params>
<applications>
<application>
<name>Database</name>
</application>
<application>
<name>Send Monitoring</name>
</application>
</applications>
<triggers>
<trigger>
<expression>{last(#2)}>0</expression>
<recovery_mode>RECOVERY_EXPRESSION</recovery_mode>
<recovery_expression>{last(#1)}=0</recovery_expression>
<name>Application Send Monitoring - LATE</name>
<priority>DISASTER</priority>
</trigger>
</triggers>
</item>
<item>
<name>Send Monitoring LONG-RUNNING</name>
<type>ODBC</type>
<key>db.odbc.select[<send-monitoring-long-running>,pgres-cluster]</key>
<history>60d</history>
<trends>60d</trends>
<params>select sum("count") as send_count
from(
select count(*) as "count"
from sys.email_send_order
where status in ('PREPARING')
and (public.now_acc(account_id) - delivery_date)::interval > (interval '20 minute')
union
select count(*) as "count"
from sys.email_send_order
where status in ('PREPARED')
and (public.now_acc(account_id) - updated_at)::interval > (interval '30 minute')
union
select count(*) as "count"
from sys.email_send_order
where status ='PREPARED'
and relayed_count = 0
and (public.now_acc(account_id) - updated_at)::interval > (interval '5 minute')
)as A;</params>
<applications>
<application>
<name>Database</name>
</application>
<application>
<name>Send Monitoring</name>
</application>
</applications>
<triggers>
<trigger>
<expression>{last(#2)}>0</expression>
<recovery_mode>RECOVERY_EXPRESSION</recovery_mode>
<recovery_expression>{last(#1)}=0</recovery_expression>
<name>Application Send Monitoring - LONG-RUNNING</name>
<priority>DISASTER</priority>
</trigger>
</triggers>
</item>
</items>
<inventory_mode>DISABLED</inventory_mode>
</host>
</hosts>
</zabbix_export>
You have to create a zabbix_user on you postgresql first. Then yo have to give a select permission to related tables. On onpremise setups this user should be already created on deployment time.
On zabbix-server, please append the content below to your /etc/odbc.ini
file.
[pgres-cluster]
Description = PostgreSQL database cluster dengage db
Driver = postgresql
Username = zabbix_user
Password = {zabbix_user_password_here}
Servername = {postgresql_cluster_vip_here}
Port = {postgresql_cluster_port_here_5001}
Database = dengage
ReadOnly = Yes
Protocol = 7.4+
ShowOidColumn = No
FakeOidIndex = No
RowVersioning = No
ShowSystemTables = No
Fetch = Yes
BoolsAsChar = Yes
ConnSettings =
[pgres-cluster-postgres]
Description = PostgreSQL database cluster dengage db
Driver = postgresql
Username = zabbix_user
Password = {zabbix_user_password_here}
Servername = {postgresql_cluster_vip_here}
Port = {postgresql_cluster_port_here_5001}
Database = postgres
ReadOnly = Yes
Protocol = 7.4+
ShowOidColumn = No
FakeOidIndex = No
RowVersioning = No
ShowSystemTables = No
Fetch = Yes
BoolsAsChar = Yes
ConnSettings =
[pgres-cluster-dcloud]
Description = PostgreSQL database cluster - dcloud db
Driver = postgresql
Username = zabbix_user
Password = {zabbix_user_password_here}
Servername = {postgresql_cluster_vip_here}
Port = {postgresql_cluster_port_here_5001}
Database = dcloud
ReadOnly = Yes
Protocol = 7.4+
ShowOidColumn = No
FakeOidIndex = No
RowVersioning = No
ShowSystemTables = No
Fetch = Yes
BoolsAsChar = Yes
ConnSettings =
and make sure the [PostgreSQL] section exists on /etc/odbcinst.ini
[PostgreSQL]
Description = General ODBC for PostgreSQL
Driver = /usr/pgsql-12/lib/psqlodbcw.so
Setup = /usr/lib64/libodbcpsqlS.so
FileUsage = 1
Threading = 2