Brown University Benchmark

MgBench is a new analytical benchmark for machine-generated log data, developed by Andrew Crotty.

Download the data:

  1. wget https://datasets.clickhouse.tech/mgbench{1..3}.csv.xz

Unpack the data:

  1. xz -v -d mgbench{1..3}.csv.xz

Create tables:

  1. CREATE DATABASE mgbench;
  2. CREATE TABLE mgbench.logs1 (
  3. log_time DateTime,
  4. machine_name LowCardinality(String),
  5. machine_group LowCardinality(String),
  6. cpu_idle Nullable(Float32),
  7. cpu_nice Nullable(Float32),
  8. cpu_system Nullable(Float32),
  9. cpu_user Nullable(Float32),
  10. cpu_wio Nullable(Float32),
  11. disk_free Nullable(Float32),
  12. disk_total Nullable(Float32),
  13. part_max_used Nullable(Float32),
  14. load_fifteen Nullable(Float32),
  15. load_five Nullable(Float32),
  16. load_one Nullable(Float32),
  17. mem_buffers Nullable(Float32),
  18. mem_cached Nullable(Float32),
  19. mem_free Nullable(Float32),
  20. mem_shared Nullable(Float32),
  21. swap_free Nullable(Float32),
  22. bytes_in Nullable(Float32),
  23. bytes_out Nullable(Float32)
  24. )
  25. ENGINE = MergeTree()
  26. ORDER BY (machine_group, machine_name, log_time);
  27. CREATE TABLE mgbench.logs2 (
  28. log_time DateTime,
  29. client_ip IPv4,
  30. request String,
  31. status_code UInt16,
  32. object_size UInt64
  33. )
  34. ENGINE = MergeTree()
  35. ORDER BY log_time;
  36. CREATE TABLE mgbench.logs3 (
  37. log_time DateTime64,
  38. device_id FixedString(15),
  39. device_name LowCardinality(String),
  40. device_type LowCardinality(String),
  41. device_floor UInt8,
  42. event_type LowCardinality(String),
  43. event_unit FixedString(1),
  44. event_value Nullable(Float32)
  45. )
  46. ENGINE = MergeTree()
  47. ORDER BY (event_type, log_time);

Insert data:

  1. clickhouse-client --query "INSERT INTO mgbench.logs1 FORMAT CSVWithNames" < mgbench1.csv
  2. clickhouse-client --query "INSERT INTO mgbench.logs2 FORMAT CSVWithNames" < mgbench2.csv
  3. clickhouse-client --query "INSERT INTO mgbench.logs3 FORMAT CSVWithNames" < mgbench3.csv

Run benchmark queries:

  1. -- Q1.1: What is the CPU/network utilization for each web server since midnight?
  2. SELECT machine_name,
  3. MIN(cpu) AS cpu_min,
  4. MAX(cpu) AS cpu_max,
  5. AVG(cpu) AS cpu_avg,
  6. MIN(net_in) AS net_in_min,
  7. MAX(net_in) AS net_in_max,
  8. AVG(net_in) AS net_in_avg,
  9. MIN(net_out) AS net_out_min,
  10. MAX(net_out) AS net_out_max,
  11. AVG(net_out) AS net_out_avg
  12. FROM (
  13. SELECT machine_name,
  14. COALESCE(cpu_user, 0.0) AS cpu,
  15. COALESCE(bytes_in, 0.0) AS net_in,
  16. COALESCE(bytes_out, 0.0) AS net_out
  17. FROM logs1
  18. WHERE machine_name IN ('anansi','aragog','urd')
  19. AND log_time >= TIMESTAMP '2017-01-11 00:00:00'
  20. ) AS r
  21. GROUP BY machine_name;
  22. -- Q1.2: Which computer lab machines have been offline in the past day?
  23. SELECT machine_name,
  24. log_time
  25. FROM logs1
  26. WHERE (machine_name LIKE 'cslab%' OR
  27. machine_name LIKE 'mslab%')
  28. AND load_one IS NULL
  29. AND log_time >= TIMESTAMP '2017-01-10 00:00:00'
  30. ORDER BY machine_name,
  31. log_time;
  32. -- Q1.3: What are the hourly average metrics during the past 10 days for a specific workstation?
  33. SELECT dt,
  34. hr,
  35. AVG(load_fifteen) AS load_fifteen_avg,
  36. AVG(load_five) AS load_five_avg,
  37. AVG(load_one) AS load_one_avg,
  38. AVG(mem_free) AS mem_free_avg,
  39. AVG(swap_free) AS swap_free_avg
  40. FROM (
  41. SELECT CAST(log_time AS DATE) AS dt,
  42. EXTRACT(HOUR FROM log_time) AS hr,
  43. load_fifteen,
  44. load_five,
  45. load_one,
  46. mem_free,
  47. swap_free
  48. FROM logs1
  49. WHERE machine_name = 'babbage'
  50. AND load_fifteen IS NOT NULL
  51. AND load_five IS NOT NULL
  52. AND load_one IS NOT NULL
  53. AND mem_free IS NOT NULL
  54. AND swap_free IS NOT NULL
  55. AND log_time >= TIMESTAMP '2017-01-01 00:00:00'
  56. ) AS r
  57. GROUP BY dt,
  58. hr
  59. ORDER BY dt,
  60. hr;
  61. -- Q1.4: Over 1 month, how often was each server blocked on disk I/O?
  62. SELECT machine_name,
  63. COUNT(*) AS spikes
  64. FROM logs1
  65. WHERE machine_group = 'Servers'
  66. AND cpu_wio > 0.99
  67. AND log_time >= TIMESTAMP '2016-12-01 00:00:00'
  68. AND log_time < TIMESTAMP '2017-01-01 00:00:00'
  69. GROUP BY machine_name
  70. ORDER BY spikes DESC
  71. LIMIT 10;
  72. -- Q1.5: Which externally reachable VMs have run low on memory?
  73. SELECT machine_name,
  74. dt,
  75. MIN(mem_free) AS mem_free_min
  76. FROM (
  77. SELECT machine_name,
  78. CAST(log_time AS DATE) AS dt,
  79. mem_free
  80. FROM logs1
  81. WHERE machine_group = 'DMZ'
  82. AND mem_free IS NOT NULL
  83. ) AS r
  84. GROUP BY machine_name,
  85. dt
  86. HAVING MIN(mem_free) < 10000
  87. ORDER BY machine_name,
  88. dt;
  89. -- Q1.6: What is the total hourly network traffic across all file servers?
  90. SELECT dt,
  91. hr,
  92. SUM(net_in) AS net_in_sum,
  93. SUM(net_out) AS net_out_sum,
  94. SUM(net_in) + SUM(net_out) AS both_sum
  95. FROM (
  96. SELECT CAST(log_time AS DATE) AS dt,
  97. EXTRACT(HOUR FROM log_time) AS hr,
  98. COALESCE(bytes_in, 0.0) / 1000000000.0 AS net_in,
  99. COALESCE(bytes_out, 0.0) / 1000000000.0 AS net_out
  100. FROM logs1
  101. WHERE machine_name IN ('allsorts','andes','bigred','blackjack','bonbon',
  102. 'cadbury','chiclets','cotton','crows','dove','fireball','hearts','huey',
  103. 'lindt','milkduds','milkyway','mnm','necco','nerds','orbit','peeps',
  104. 'poprocks','razzles','runts','smarties','smuggler','spree','stride',
  105. 'tootsie','trident','wrigley','york')
  106. ) AS r
  107. GROUP BY dt,
  108. hr
  109. ORDER BY both_sum DESC
  110. LIMIT 10;
  111. -- Q2.1: Which requests have caused server errors within the past 2 weeks?
  112. SELECT *
  113. FROM logs2
  114. WHERE status_code >= 500
  115. AND log_time >= TIMESTAMP '2012-12-18 00:00:00'
  116. ORDER BY log_time;
  117. -- Q2.2: During a specific 2-week period, was the user password file leaked?
  118. SELECT *
  119. FROM logs2
  120. WHERE status_code >= 200
  121. AND status_code < 300
  122. AND request LIKE '%/etc/passwd%'
  123. AND log_time >= TIMESTAMP '2012-05-06 00:00:00'
  124. AND log_time < TIMESTAMP '2012-05-20 00:00:00';
  125. -- Q2.3: What was the average path depth for top-level requests in the past month?
  126. SELECT top_level,
  127. AVG(LENGTH(request) - LENGTH(REPLACE(request, '/', ''))) AS depth_avg
  128. FROM (
  129. SELECT SUBSTRING(request FROM 1 FOR len) AS top_level,
  130. request
  131. FROM (
  132. SELECT POSITION(SUBSTRING(request FROM 2), '/') AS len,
  133. request
  134. FROM logs2
  135. WHERE status_code >= 200
  136. AND status_code < 300
  137. AND log_time >= TIMESTAMP '2012-12-01 00:00:00'
  138. ) AS r
  139. WHERE len > 0
  140. ) AS s
  141. WHERE top_level IN ('/about','/courses','/degrees','/events',
  142. '/grad','/industry','/news','/people',
  143. '/publications','/research','/teaching','/ugrad')
  144. GROUP BY top_level
  145. ORDER BY top_level;
  146. -- Q2.4: During the last 3 months, which clients have made an excessive number of requests?
  147. SELECT client_ip,
  148. COUNT(*) AS num_requests
  149. FROM logs2
  150. WHERE log_time >= TIMESTAMP '2012-10-01 00:00:00'
  151. GROUP BY client_ip
  152. HAVING COUNT(*) >= 100000
  153. ORDER BY num_requests DESC;
  154. -- Q2.5: What are the daily unique visitors?
  155. SELECT dt,
  156. COUNT(DISTINCT client_ip)
  157. FROM (
  158. SELECT CAST(log_time AS DATE) AS dt,
  159. client_ip
  160. FROM logs2
  161. ) AS r
  162. GROUP BY dt
  163. ORDER BY dt;
  164. -- Q2.6: What are the average and maximum data transfer rates (Gbps)?
  165. SELECT AVG(transfer) / 125000000.0 AS transfer_avg,
  166. MAX(transfer) / 125000000.0 AS transfer_max
  167. FROM (
  168. SELECT log_time,
  169. SUM(object_size) AS transfer
  170. FROM logs2
  171. GROUP BY log_time
  172. ) AS r;
  173. -- Q3.1: Did the indoor temperature reach freezing over the weekend?
  174. SELECT *
  175. FROM logs3
  176. WHERE event_type = 'temperature'
  177. AND event_value <= 32.0
  178. AND log_time >= '2019-11-29 17:00:00.000';
  179. -- Q3.4: Over the past 6 months, how frequently was each door opened?
  180. SELECT device_name,
  181. device_floor,
  182. COUNT(*) AS ct
  183. FROM logs3
  184. WHERE event_type = 'door_open'
  185. AND log_time >= '2019-06-01 00:00:00.000'
  186. GROUP BY device_name,
  187. device_floor
  188. ORDER BY ct DESC;
  189. -- Q3.5: Where in the building do large temperature variations occur in winter and summer?
  190. WITH temperature AS (
  191. SELECT dt,
  192. device_name,
  193. device_type,
  194. device_floor
  195. FROM (
  196. SELECT dt,
  197. hr,
  198. device_name,
  199. device_type,
  200. device_floor,
  201. AVG(event_value) AS temperature_hourly_avg
  202. FROM (
  203. SELECT CAST(log_time AS DATE) AS dt,
  204. EXTRACT(HOUR FROM log_time) AS hr,
  205. device_name,
  206. device_type,
  207. device_floor,
  208. event_value
  209. FROM logs3
  210. WHERE event_type = 'temperature'
  211. ) AS r
  212. GROUP BY dt,
  213. hr,
  214. device_name,
  215. device_type,
  216. device_floor
  217. ) AS s
  218. GROUP BY dt,
  219. device_name,
  220. device_type,
  221. device_floor
  222. HAVING MAX(temperature_hourly_avg) - MIN(temperature_hourly_avg) >= 25.0
  223. )
  224. SELECT DISTINCT device_name,
  225. device_type,
  226. device_floor,
  227. 'WINTER'
  228. FROM temperature
  229. WHERE dt >= DATE '2018-12-01'
  230. AND dt < DATE '2019-03-01'
  231. UNION
  232. SELECT DISTINCT device_name,
  233. device_type,
  234. device_floor,
  235. 'SUMMER'
  236. FROM temperature
  237. WHERE dt >= DATE '2019-06-01'
  238. AND dt < DATE '2019-09-01';
  239. -- Q3.6: For each device category, what are the monthly power consumption metrics?
  240. SELECT yr,
  241. mo,
  242. SUM(coffee_hourly_avg) AS coffee_monthly_sum,
  243. AVG(coffee_hourly_avg) AS coffee_monthly_avg,
  244. SUM(printer_hourly_avg) AS printer_monthly_sum,
  245. AVG(printer_hourly_avg) AS printer_monthly_avg,
  246. SUM(projector_hourly_avg) AS projector_monthly_sum,
  247. AVG(projector_hourly_avg) AS projector_monthly_avg,
  248. SUM(vending_hourly_avg) AS vending_monthly_sum,
  249. AVG(vending_hourly_avg) AS vending_monthly_avg
  250. FROM (
  251. SELECT dt,
  252. yr,
  253. mo,
  254. hr,
  255. AVG(coffee) AS coffee_hourly_avg,
  256. AVG(printer) AS printer_hourly_avg,
  257. AVG(projector) AS projector_hourly_avg,
  258. AVG(vending) AS vending_hourly_avg
  259. FROM (
  260. SELECT CAST(log_time AS DATE) AS dt,
  261. EXTRACT(YEAR FROM log_time) AS yr,
  262. EXTRACT(MONTH FROM log_time) AS mo,
  263. EXTRACT(HOUR FROM log_time) AS hr,
  264. CASE WHEN device_name LIKE 'coffee%' THEN event_value END AS coffee,
  265. CASE WHEN device_name LIKE 'printer%' THEN event_value END AS printer,
  266. CASE WHEN device_name LIKE 'projector%' THEN event_value END AS projector,
  267. CASE WHEN device_name LIKE 'vending%' THEN event_value END AS vending
  268. FROM logs3
  269. WHERE device_type = 'meter'
  270. ) AS r
  271. GROUP BY dt,
  272. yr,
  273. mo,
  274. hr
  275. ) AS s
  276. GROUP BY yr,
  277. mo
  278. ORDER BY yr,
  279. mo;

The data is also available for interactive queries in the Playground; see the example query there.

Original article