Every 5-7 days or so, my Zabbix Server 6.2.7 (running on Ubuntu 20.04, using Aurora RDS) crashes with the same error, pasted below. I cannot even restart the Zabbix Server process; I have to forcefully reboot the EC2 instance to recover. I have no idea why this is happening, but it recurs fairly consistently every 5-7 days, and I haven't been able to pin down any correlations. Any guidance would be super duper helpful!
13534:20231010:083149.667 slow query: 3.148626 sec, "begin;"
13515:20231010:083154.439 slow query: 6.648752 sec, "select ip.itemid,ip.step,ip.type,ip.params,ip.error_handler,ip.error_handler_params from item_preproc ip,item_discovery id where ip.itemid=id.itemid and id.parent_itemid=183705"
13542:20231010:083155.076 slow query: 4.471124 sec, "select escalationid,actionid,triggerid,eventid,r_eventid,nextcheck,esc_step,status,itemid,acknowledgeid,servicealarmid,serviceid from escalations where triggerid is not null and mod(triggerid,5)=0 and nextcheck<=1696926713 order by actionid,triggerid,itemid,r_eventid asc,escalationid"
13543:20231010:083155.312 slow query: 4.481551 sec, "select escalationid,actionid,triggerid,eventid,r_eventid,nextcheck,esc_step,status,itemid,acknowledgeid,servicealarmid,serviceid from escalations where triggerid is not null and mod(triggerid,5)=1 and nextcheck<=1696926713 order by actionid,triggerid,itemid,r_eventid asc,escalationid"
14140:20231010:083156.689 slow query: 3.049810 sec, "select t.taskid,t.type,t.clock,t.ttl,c.command_type,c.execute_on,c.port,c.authtype,c.username,c.password,c.publickey,c.privatekey,c.command,c.alertid,c.parent_taskid,c.hostid,cn.itemid,d.data,d.parent_taskid,d.type from task t left join task_remote_command c on t.taskid=c.taskid left join task_check_now cn on t.taskid=cn.taskid left join task_data d on t.taskid=d.taskid where t.status=1 and t.proxy_hostid=14237 and (t.ttl=0 or t.clock+t.ttl>1696926713) order by t.taskid"
13499:20231010:083157.476 slow query: 3.528084 sec, "begin;"
14211:20231010:083157.521 failed to accept an incoming connection: from -------- : unspecified certificate verification error: TLS handshake set result code to 5:
13535:20231010:083158.886 slow query: 5.219143 sec, "begin;"
13524:20231010:083158.895 slow query: 4.498533 sec, "commit;"
13546:20231010:083159.251 slow query: 5.569650 sec, "select escalationid,actionid,triggerid,eventid,r_eventid,nextcheck,esc_step,status,itemid,acknowledgeid,servicealarmid,serviceid from escalations where triggerid is not null and mod(triggerid,5)=4 and nextcheck<=1696926716 order by actionid,triggerid,itemid,r_eventid asc,escalationid"
13534:20231010:083159.762 slow query: 4.433611 sec, "begin;"
14119:20231010:083202.509 slow query: 19.266609 sec, "select t.itemid,t.type,t.snmp_oid,t.hostid,t.key_,t.delay,t.history,t.status,t.value_type,t.trapper_hosts,t.logtimefmt,t.params,t.ipmi_sensor,t.authtype,t.username,t.password,t.publickey,t.privatekey,t.flags,t.interfaceid,t.inventory_link,t.jmx_endpoint,t.master_itemid,t.timeout,t.url,t.query_fields,t.posts,t.status_codes,t.follow_redirects,t.post_type,t.http_proxy,t.headers,t.retrieve_mode,t.request_method,t.output_format,t.ssl_cert_file,t.ssl_key_file,t.ssl_key_password,t.verify_peer,t.verify_host,t.allow_traps from items t,hosts r where t.hostid=r.hostid and r.proxy_hostid=10560 and r.status in (0,1) and t.flags<>2 and t.type in (0,7,20,12,2,3,9,10,11,13,14,16,17,5,19,18,21) order by t.itemid"
13498:20231010:083202.512 One child process died (PID:13501,exitcode/signal:9). Exiting ...
13676:20231010:083202.555 slow query: 3.356094 sec, "select t.taskid,t.type,t.clock,t.ttl,c.command_type,c.execute_on,c.port,c.authtype,c.username,c.password,c.publickey,c.privatekey,c.command,c.alertid,c.parent_taskid,c.hostid,cn.itemid,d.data,d.parent_taskid,d.type from task t left join task_remote_command c on t.taskid=c.taskid left join task_check_now cn on t.taskid=cn.taskid left join task_data d on t.taskid=d.taskid where t.status=1 and t.proxy_hostid=10560 and (t.ttl=0 or t.clock+t.ttl>1696926719) order by t.taskid"
13523:20231010:083202.559 slow query: 3.670100 sec, "begin;"
13928:20231010:083202.596 failed to accept an incoming connection: from ------------ : SSL_accept() timed out
13545:20231010:083202.599 slow query: 3.146755 sec, "select escalationid,actionid,triggerid,eventid,r_eventid,nextcheck,esc_step,status,itemid,acknowledgeid,servicealarmid,serviceid from escalations where triggerid is not null and mod(triggerid,5)=3 and nextcheck<=1696926722 order by actionid,triggerid,itemid,r_eventid asc,escalationid"
13543:20231010:083202.713 slow query: 4.892580 sec, "select escalationid,actionid,triggerid,eventid,r_eventid,nextcheck,esc_step,status,itemid,acknowledgeid,servicealarmid,serviceid from escalations where triggerid is null and itemid is not null and mod(itemid,5)=1 and nextcheck<=1696926720 order by actionid,triggerid,itemid,r_eventid asc,escalationid"
14154:20231010:083202.749 failed to accept an incoming connection: from ----------- : SSL_accept() timed out
13498:20231010:083202.952 PROCESS EXIT: 13501
13499:20231010:083202.952 HA manager has been paused
zabbix_server [13498]: Error waiting for process with PID 13501: [10] No child processes
WARNING: MYSQL_OPT_RECONNECT is deprecated and will be removed in a future version.
13498:20231010:083204.486 syncing history data...
13498:20231010:083204.497 [Z3008] query failed due to primary key constraint: [1062] Duplicate entry '3003336-1696926707-415114176' for key 'history.PRIMARY'
13498:20231010:083204.537 skipped 932 duplicates
13498:20231010:083305.009 cannot connect to service manager service: Cannot connect to service "service": [111] Connection refused.
13515:20231010:083154.439 slow query: 6.648752 sec, "select ip.itemid,ip.step,ip.type,ip.params,ip.error_handler,ip.error_handler_params from item_preproc ip,item_discovery id where ip.itemid=id.itemid and id.parent_itemid=183705"
13542:20231010:083155.076 slow query: 4.471124 sec, "select escalationid,actionid,triggerid,eventid,r_eventid,nextcheck,esc_step,status,itemid,acknowledgeid,servicealarmid,serviceid from escalations where triggerid is not null and mod(triggerid,5)=0 and nextcheck<=1696926713 order by actionid,triggerid,itemid,r_eventid asc,escalationid"
13543:20231010:083155.312 slow query: 4.481551 sec, "select escalationid,actionid,triggerid,eventid,r_eventid,nextcheck,esc_step,status,itemid,acknowledgeid,servicealarmid,serviceid from escalations where triggerid is not null and mod(triggerid,5)=1 and nextcheck<=1696926713 order by actionid,triggerid,itemid,r_eventid asc,escalationid"
14140:20231010:083156.689 slow query: 3.049810 sec, "select t.taskid,t.type,t.clock,t.ttl,c.command_type,c.execute_on,c.port,c.authtype,c.username,c.password,c.publickey,c.privatekey,c.command,c.alertid,c.parent_taskid,c.hostid,cn.itemid,d.data,d.parent_taskid,d.type from task t left join task_remote_command c on t.taskid=c.taskid left join task_check_now cn on t.taskid=cn.taskid left join task_data d on t.taskid=d.taskid where t.status=1 and t.proxy_hostid=14237 and (t.ttl=0 or t.clock+t.ttl>1696926713) order by t.taskid"
13499:20231010:083157.476 slow query: 3.528084 sec, "begin;"
14211:20231010:083157.521 failed to accept an incoming connection: from -------- : unspecified certificate verification error: TLS handshake set result code to 5:
13535:20231010:083158.886 slow query: 5.219143 sec, "begin;"
13524:20231010:083158.895 slow query: 4.498533 sec, "commit;"
13546:20231010:083159.251 slow query: 5.569650 sec, "select escalationid,actionid,triggerid,eventid,r_eventid,nextcheck,esc_step,status,itemid,acknowledgeid,servicealarmid,serviceid from escalations where triggerid is not null and mod(triggerid,5)=4 and nextcheck<=1696926716 order by actionid,triggerid,itemid,r_eventid asc,escalationid"
13534:20231010:083159.762 slow query: 4.433611 sec, "begin;"
14119:20231010:083202.509 slow query: 19.266609 sec, "select t.itemid,t.type,t.snmp_oid,t.hostid,t.key_,t.delay,t.history,t.status,t.value_type,t.trapper_hosts,t.logtimefmt,t.params,t.ipmi_sensor,t.authtype,t.username,t.password,t.publickey,t.privatekey,t.flags,t.interfaceid,t.inventory_link,t.jmx_endpoint,t.master_itemid,t.timeout,t.url,t.query_fields,t.posts,t.status_codes,t.follow_redirects,t.post_type,t.http_proxy,t.headers,t.retrieve_mode,t.request_method,t.output_format,t.ssl_cert_file,t.ssl_key_file,t.ssl_key_password,t.verify_peer,t.verify_host,t.allow_traps from items t,hosts r where t.hostid=r.hostid and r.proxy_hostid=10560 and r.status in (0,1) and t.flags<>2 and t.type in (0,7,20,12,2,3,9,10,11,13,14,16,17,5,19,18,21) order by t.itemid"
13498:20231010:083202.512 One child process died (PID:13501,exitcode/signal:9). Exiting ...
13676:20231010:083202.555 slow query: 3.356094 sec, "select t.taskid,t.type,t.clock,t.ttl,c.command_type,c.execute_on,c.port,c.authtype,c.username,c.password,c.publickey,c.privatekey,c.command,c.alertid,c.parent_taskid,c.hostid,cn.itemid,d.data,d.parent_taskid,d.type from task t left join task_remote_command c on t.taskid=c.taskid left join task_check_now cn on t.taskid=cn.taskid left join task_data d on t.taskid=d.taskid where t.status=1 and t.proxy_hostid=10560 and (t.ttl=0 or t.clock+t.ttl>1696926719) order by t.taskid"
13523:20231010:083202.559 slow query: 3.670100 sec, "begin;"
13928:20231010:083202.596 failed to accept an incoming connection: from ------------ : SSL_accept() timed out
13545:20231010:083202.599 slow query: 3.146755 sec, "select escalationid,actionid,triggerid,eventid,r_eventid,nextcheck,esc_step,status,itemid,acknowledgeid,servicealarmid,serviceid from escalations where triggerid is not null and mod(triggerid,5)=3 and nextcheck<=1696926722 order by actionid,triggerid,itemid,r_eventid asc,escalationid"
13543:20231010:083202.713 slow query: 4.892580 sec, "select escalationid,actionid,triggerid,eventid,r_eventid,nextcheck,esc_step,status,itemid,acknowledgeid,servicealarmid,serviceid from escalations where triggerid is null and itemid is not null and mod(itemid,5)=1 and nextcheck<=1696926720 order by actionid,triggerid,itemid,r_eventid asc,escalationid"
14154:20231010:083202.749 failed to accept an incoming connection: from ----------- : SSL_accept() timed out
13498:20231010:083202.952 PROCESS EXIT: 13501
13499:20231010:083202.952 HA manager has been paused
zabbix_server [13498]: Error waiting for process with PID 13501: [10] No child processes
WARNING: MYSQL_OPT_RECONNECT is deprecated and will be removed in a future version.
13498:20231010:083204.486 syncing history data...
13498:20231010:083204.497 [Z3008] query failed due to primary key constraint: [1062] Duplicate entry '3003336-1696926707-415114176' for key 'history.PRIMARY'
13498:20231010:083204.537 skipped 932 duplicates
13498:20231010:083305.009 cannot connect to service manager service: Cannot connect to service "service": [111] Connection refused.
Comment