Monitoring Availability Groups Part 1 – Perfmon Counters

This entry is part 1 of 3 in the series Monitoring Availability Groups

Monitoring Availability Groups can be tricky.  The DMVs don’t update the log_send_rate and redo_rate all the time especially if nothing is happening so if you try to use those for calculations when monitoring you could false results worse yet pages in the middle of the night.  In order to calculate the log_send_rate and redo_rate you need to capture the perfmon counters ‘Log Bytes Flushed/sec’, ‘Redone Bytes/sec’, and ‘Log Bytes Received/sec’ into temporary tables WAITFOR 1 second then capture them again.  Below is query that captures this along side what you see in the DMV for when Microsoft might fix the issue.

SELECT instance_name, 
	cntr_value 
INTO #Logflushes1
FROM sys.dm_os_performance_counters	
WHERE [object_name] = 'SQLServer:Databases' 
	AND counter_name = 'Log Bytes Flushed/sec'

SELECT instance_name, 
	cntr_value 
INTO #redo1
FROM sys.dm_os_performance_counters
WHERE [object_name] = 'SQLServer:Database Replica' 
	AND counter_name = 'Redone Bytes/sec'

SELECT instance_name, 
	cntr_value 
INTO #send1
FROM sys.dm_os_performance_counters 
WHERE [object_name] = 'SQLServer:Database Replica' 
	AND counter_name = 'Log Bytes Received/sec'

WAITFOR DELAY '00:00:01'

SELECT instance_name, 
	cntr_value 
INTO #Logflushes2
FROM sys.dm_os_performance_counters	
WHERE [object_name] = 'SQLServer:Databases' 
	AND counter_name = 'Log Bytes Flushed/sec'

SELECT instance_name, 
	cntr_value 
INTO #redo2
FROM sys.dm_os_performance_counters
WHERE [object_name] = 'SQLServer:Database Replica' 
	AND counter_name = 'Redone Bytes/sec'

SELECT instance_name, 
	cntr_value 
INTO #send2
FROM sys.dm_os_performance_counters 
WHERE [object_name] = 'SQLServer:Database Replica' 
	AND counter_name = 'Log Bytes Received/sec'

SELECT  r.replica_server_name
      , DB_NAME(rs.database_id) AS [DatabaseName]
	  , rs.is_primary_replica
	  ,synchronization_health_desc
	  , CASE WHEN rs.is_primary_replica = 1 THEN (CONVERT(DECIMAL(10,2), log_flushes / 1024.0)) ELSE null END  [Log KB/sec]
      , rs.log_send_queue_size
      , rs.log_send_rate [log_send_rate - dmv]
	  , send_rate / 1024.0 [log_send_rate KB - perfmon]
	  , CASE WHEN rs.is_local != 1 THEN NULL ELSE (CONVERT(DECIMAL(10,2), log_send_queue_size / CASE WHEN send_rate = 0 THEN 1 ELSE send_rate / 1024.0 END)) END [send_latency - sec] --Limit to two decimals, queue is KB, convert @send_rate to KB
      , rs.redo_queue_size
      , rs.redo_rate [redo_rate - dmv]
	  , redo_rate.redo_rate / 1024.0 [redo_rate KB - perfmon]
	  , CASE WHEN rs.is_local != 1 THEN NULL ELSE (CONVERT(DECIMAL(10,2), rs.redo_queue_size / CASE WHEN redo_rate.redo_rate = 0 THEN 1 ELSE redo_rate.redo_rate / 1024.0 END)) END [redo_latency - sec] --Limit to two decimals, queue is KB, convert @redo_rate to KB
FROM    sys.dm_hadr_database_replica_states rs
        JOIN sys.availability_replicas r ON r.group_id = rs.group_id
                                            AND r.replica_id = rs.replica_id
		INNER JOIN (SELECT l1.instance_name, l2.cntr_value - l1.cntr_value log_flushes
			FROM #Logflushes1 l1
			INNER JOIN #Logflushes2 l2 ON l2.instance_name = l1.instance_name) log_flushes ON log_flushes.instance_name = DB_NAME(rs.database_id)
		INNER JOIN (SELECT l1.instance_name, l2.cntr_value - l1.cntr_value redo_rate
			FROM #redo1 l1
				INNER JOIN #redo2 l2 ON l2.instance_name = l1.instance_name) redo_rate ON redo_rate.instance_name = DB_NAME(rs.database_id)
		INNER JOIN (SELECT l1.instance_name, l2.cntr_value - l1.cntr_value send_rate
			FROM #send1 l1
				INNER JOIN #send2 l2 ON l2.instance_name = l1.instance_name) send_rate ON send_rate.instance_name = DB_NAME(rs.database_id)
ORDER BY r.replica_server_name 

DROP TABLE #Logflushes1
DROP TABLE #Logflushes2
DROP TABLE #send1
DROP TABLE #send2
DROP TABLE #redo1
DROP TABLE #redo2

If you use Get-CMSHost function from my Running SQL Scripts Against Multiple Servers Using PowerShell post you can run this against a group of servers and check the status of all your Availability Groups at once.

Get-CmsHosts -InstanceList 'c:\temp\servers.txt' | % { New-PSSession -ComputerName $_ | out-null}
$sessions = Get-PSSession
 
$scriptblock = {
$query = @"

SELECT instance_name, 
	cntr_value 
INTO #Logflushes1
FROM sys.dm_os_performance_counters	
WHERE [object_name] = 'SQLServer:Databases' 
	AND counter_name = 'Log Bytes Flushed/sec'

SELECT instance_name, 
	cntr_value 
INTO #redo1
FROM sys.dm_os_performance_counters
WHERE [object_name] = 'SQLServer:Database Replica' 
	AND counter_name = 'Redone Bytes/sec'

SELECT instance_name, 
	cntr_value 
INTO #send1
FROM sys.dm_os_performance_counters 
WHERE [object_name] = 'SQLServer:Database Replica' 
	AND counter_name = 'Log Bytes Received/sec'

	
WAITFOR DELAY '00:00:01'

SELECT instance_name, 
	cntr_value 
INTO #Logflushes2
FROM sys.dm_os_performance_counters	
WHERE [object_name] = 'SQLServer:Databases' 
	AND counter_name = 'Log Bytes Flushed/sec'

SELECT instance_name, 
	cntr_value 
INTO #redo2
FROM sys.dm_os_performance_counters
WHERE [object_name] = 'SQLServer:Database Replica' 
	AND counter_name = 'Redone Bytes/sec'

SELECT instance_name, 
	cntr_value 
INTO #send2
FROM sys.dm_os_performance_counters 
WHERE [object_name] = 'SQLServer:Database Replica' 
	AND counter_name = 'Log Bytes Received/sec'

SELECT  r.replica_server_name
      , DB_NAME(rs.database_id) AS [DatabaseName]
	  , rs.is_primary_replica
	  ,synchronization_health_desc
	  , CASE WHEN rs.is_primary_replica = 1 THEN (CONVERT(DECIMAL(10,2), log_flushes / 1024.0)) ELSE null END  [Log KB/sec]
      , rs.log_send_queue_size
      , rs.log_send_rate [log_send_rate - dmv]
	  , send_rate / 1024.0 [log_send_rate KB - perfmon]
	  , CASE WHEN rs.is_local != 1 THEN NULL ELSE (CONVERT(DECIMAL(10,2), log_send_queue_size / CASE WHEN send_rate = 0 THEN 1 ELSE send_rate / 1024.0 END)) END [send_latency - sec] --Limit to two decimals, queue is KB, convert @send_rate to KB
      , rs.redo_queue_size
      , rs.redo_rate [redo_rate - dmv]
	  , redo_rate.redo_rate / 1024.0 [redo_rate KB - perfmon]
	  , CASE WHEN rs.is_local != 1 THEN NULL ELSE (CONVERT(DECIMAL(10,2), rs.redo_queue_size / CASE WHEN redo_rate.redo_rate = 0 THEN 1 ELSE redo_rate.redo_rate / 1024.0 END)) END [redo_latency - sec] --Limit to two decimals, queue is KB, convert @redo_rate to KB
FROM    sys.dm_hadr_database_replica_states rs
        JOIN sys.availability_replicas r ON r.group_id = rs.group_id
                                            AND r.replica_id = rs.replica_id
		INNER JOIN (SELECT l1.instance_name, l2.cntr_value - l1.cntr_value log_flushes
			FROM #Logflushes1 l1
			INNER JOIN #Logflushes2 l2 ON l2.instance_name = l1.instance_name) log_flushes ON log_flushes.instance_name = DB_NAME(rs.database_id)
		INNER JOIN (SELECT l1.instance_name, l2.cntr_value - l1.cntr_value redo_rate
			FROM #redo1 l1
				INNER JOIN #redo2 l2 ON l2.instance_name = l1.instance_name) redo_rate ON redo_rate.instance_name = DB_NAME(rs.database_id)
		INNER JOIN (SELECT l1.instance_name, l2.cntr_value - l1.cntr_value send_rate
			FROM #send1 l1
				INNER JOIN #send2 l2 ON l2.instance_name = l1.instance_name) send_rate ON send_rate.instance_name = DB_NAME(rs.database_id)
ORDER BY r.replica_server_name 

DROP TABLE #Logflushes1
DROP TABLE #Logflushes2
DROP TABLE #send1
DROP TABLE #send2
DROP TABLE #redo1
DROP TABLE #redo2
"@
Invoke-Sqlcmd -Query $query
}
 
Invoke-Command -Session $($sessions | ? { $_.State -eq 'Opened' }) -ScriptBlock $scriptblock | Select * -ExcludeProperty RunspaceId | Out-GridView
$sessions | Remove-PSSession
Series NavigationMonitoring Availability Groups Part 2 – SQL Agent Alerts >>

Related Posts

2 thoughts on “Monitoring Availability Groups Part 1 – Perfmon Counters

  1. Hi Tracy,
    Thanks for the scripts.
    I just wanted to let you know that on my instance (13.0.4411.0), counters [object_name] have this format > MSSQL$:Database Replica.
    And also that is_primary_replica column is not available for 11.0.5058.0.
    Julien

Leave a Reply to JulienCancel reply

This site uses Akismet to reduce spam. Learn how your comment data is processed.