Updated
I've got no explanation for you, since I'm on Haswell, but I do have code to share that might help you or someone else with Broadwell or Skylake hardware isolate your problem. If you could please run it on your machine and share the results, we could gain an insight into what's happening to your machine.
Intro
Recent Intel Core i7 processors have 7 performance monitor counters (PMCs), 3 fixed-function and 4 general-purpose, that may be used to profile code. The fixed-function PMCs are:
- Instructions retired
- Unhalted core cycles (Clock ticks including the effects of TurboBoost)
- Unhalted Reference cycles (Fixed-frequency clock ticks)
The ratio of core:reference clock cycles determines the relative speedup or slowdown from dynamic frequency scaling.
Although software exists (see comments below) that accesses these counters, I did not know them and still find them to be insufficiently fine-grained.
I therefore wrote myself a Linux kernel module, perfcount
, over the past few days to grant me access to the Intel performance counter monitors, and a userspace testbench and library for your code that wraps your FMA code around calls to my LKM. Instructions for how to reproduce my setup will follow.
My testbench source code is below. It warms up, then runs your code several times, testing it over a long list of metrics. I changed your loop count to 1 billion. Because only 4 general-purpose PMCs can be programmed at once, I do the measurements 4 at a time.
perfcountdemo.c
/* Includes */
#include "libperfcount.h"
#include <ctype.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
/* Function prototypes */
void code1(void);
void code2(void);
void code3(void);
void code4(void);
void code5(void);
/* Global variables */
void ((*FN_TABLE[])(void)) = {
code1,
code2,
code3,
code4,
code5
};
/**
* Code snippets to bench
*/
void code1(void){
asm volatile(
".intel_syntax noprefix
"
"vzeroall
"
"mov rcx, 1000000000
"
"LstartLabel1:
"
"vfmadd231ps %%ymm0, %%ymm0, %%ymm0
"
"vfmadd231ps ymm1, ymm1, ymm1
"
"vfmadd231ps ymm2, ymm2, ymm2
"
"vfmadd231ps ymm3, ymm3, ymm3
"
"vfmadd231ps ymm4, ymm4, ymm4
"
"vfmadd231ps ymm5, ymm5, ymm5
"
"vfmadd231ps ymm6, ymm6, ymm6
"
"vfmadd231ps ymm7, ymm7, ymm7
"
"vfmadd231ps ymm8, ymm8, ymm8
"
"vfmadd231ps ymm9, ymm9, ymm9
"
"vpaddd ymm10, ymm10, ymm10
"
"vpaddd ymm11, ymm11, ymm11
"
"vpaddd ymm12, ymm12, ymm12
"
"vpaddd ymm13, ymm13, ymm13
"
"vpaddd ymm14, ymm14, ymm14
"
"dec rcx
"
"jnz LstartLabel1
"
".att_syntax noprefix
"
: /* No outputs we care about */
: /* No inputs we care about */
: "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7",
"xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15",
"rcx",
"memory"
);
}
void code2(void){
}
void code3(void){
}
void code4(void){
}
void code5(void){
}
/* Test Schedule */
const char* const SCHEDULE[] = {
/* Batch */
"uops_issued.any",
"uops_issued.any<1",
"uops_issued.any>=1",
"uops_issued.any>=2",
/* Batch */
"uops_issued.any>=3",
"uops_issued.any>=4",
"uops_issued.any>=5",
"uops_issued.any>=6",
/* Batch */
"uops_executed_port.port_0",
"uops_executed_port.port_1",
"uops_executed_port.port_2",
"uops_executed_port.port_3",
/* Batch */
"uops_executed_port.port_4",
"uops_executed_port.port_5",
"uops_executed_port.port_6",
"uops_executed_port.port_7",
/* Batch */
"resource_stalls.any",
"resource_stalls.rs",
"resource_stalls.sb",
"resource_stalls.rob",
/* Batch */
"uops_retired.all",
"uops_retired.all<1",
"uops_retired.all>=1",
"uops_retired.all>=2",
/* Batch */
"uops_retired.all>=3",
"uops_retired.all>=4",
"uops_retired.all>=5",
"uops_retired.all>=6",
/* Batch */
"inst_retired.any_p",
"inst_retired.any_p<1",
"inst_retired.any_p>=1",
"inst_retired.any_p>=2",
/* Batch */
"inst_retired.any_p>=3",
"inst_retired.any_p>=4",
"inst_retired.any_p>=5",
"inst_retired.any_p>=6",
/* Batch */
"idq_uops_not_delivered.core",
"idq_uops_not_delivered.core<1",
"idq_uops_not_delivered.core>=1",
"idq_uops_not_delivered.core>=2",
/* Batch */
"idq_uops_not_delivered.core>=3",
"idq_uops_not_delivered.core>=4",
"rs_events.empty",
"idq.empty",
/* Batch */
"idq.mite_all_uops",
"idq.mite_all_uops<1",
"idq.mite_all_uops>=1",
"idq.mite_all_uops>=2",
/* Batch */
"idq.mite_all_uops>=3",
"idq.mite_all_uops>=4",
"move_elimination.int_not_eliminated",
"move_elimination.simd_not_eliminated",
/* Batch */
"lsd.uops",
"lsd.uops<1",
"lsd.uops>=1",
"lsd.uops>=2",
/* Batch */
"lsd.uops>=3",
"lsd.uops>=4",
"ild_stall.lcp",
"ild_stall.iq_full",
/* Batch */
"br_inst_exec.all_branches",
"br_inst_exec.0x81",
"br_inst_exec.0x82",
"icache.misses",
/* Batch */
"br_misp_exec.all_branches",
"br_misp_exec.0x81",
"br_misp_exec.0x82",
"fp_assist.any",
/* Batch */
"cpu_clk_unhalted.core_clk",
"cpu_clk_unhalted.ref_xclk",
"baclears.any"
};
const int NUMCOUNTS = sizeof(SCHEDULE)/sizeof(*SCHEDULE);
/**
* Main
*/
int main(int argc, char* argv[]){
int i;
/**
* Initialize
*/
pfcInit();
if(argc <= 1){
pfcDumpEvents();
exit(1);
}
pfcPinThread(3);
/**
* Arguments are:
*
* perfcountdemo #codesnippet
*
* There is a schedule of configuration that is followed.
*/
void (*fn)(void) = FN_TABLE[strtoull(argv[1], NULL, 0)];
static const uint64_t ZERO_CNT[7] = {0,0,0,0,0,0,0};
static const uint64_t ZERO_CFG[7] = {0,0,0,0,0,0,0};
uint64_t cnt[7] = {0,0,0,0,0,0,0};
uint64_t cfg[7] = {2,2,2,0,0,0,0};
/* Warmup */
for(i=0;i<10;i++){
fn();
}
/* Run master loop */
for(i=0;i<NUMCOUNTS;i+=4){
/* Configure counters */
const char* sched0 = i+0 < NUMCOUNTS ? SCHEDULE[i+0] : "";
const char* sched1 = i+1 < NUMCOUNTS ? SCHEDULE[i+1] : "";
const char* sched2 = i+2 < NUMCOUNTS ? SCHEDULE[i+2] : "";
const char* sched3 = i+3 < NUMCOUNTS ? SCHEDULE[i+3] : "";
cfg[3] = pfcParseConfig(sched0);
cfg[4] = pfcParseConfig(sched1);
cfg[5] = pfcParseConfig(sched2);
cfg[6] = pfcParseConfig(sched3);
pfcWrConfigCnts(0, 7, cfg);
pfcWrCountsCnts(0, 7, ZERO_CNT);
pfcRdCountsCnts(0, 7, cnt);
/* ^ Should report 0s, and launch the counters. */
/************** Hot section **************/
fn();
/************ End Hot section ************/
pfcRdCountsCnts(0, 7, cnt);
pfcWrConfigCnts(0, 7, ZERO_CFG);
/* ^ Should clear the counter config and disable them. */
/**
* Print the lovely results
*/
printf("Instructions Issued : %20llu
", cnt[0]);
printf("Unhalted core cycles : %20llu
", cnt[1]);
printf("Unhalted reference cycles : %20llu
", cnt[2]);
printf("%-35s: %20llu
", sched0, cnt[3]);
printf("%-35s: %20llu
", sched1, cnt[4]);
printf("%-35s: %20llu
", sched2, cnt[5]);
printf("%-35s: %20llu
", sched3, cnt[6]);
}
/**
* Close up shop
*/
pfcFini();
}
On my machine, I got the following results:
Haswell Core i7-4700MQ
> ./perfcountdemo 0
Instructions Issued : 17000001807
Unhalted core cycles : 5305920785
Unhalted reference cycles : 4245764952
uops_issued.any : 16000811079
uops_issued.any<1 : 1311417889
uops_issued.any>=1 : 4000292290
uops_issued.any>=2 : 4000229358
Instructions Issued : 17000001806
Unhalted core cycles : 5303822082
Unhalted reference cycles : 4243345896
uops_issued.any>=3 : 4000156998
uops_issued.any>=4 : 4000110067
uops_issued.any>=5 : 0
uops_issued.any>=6 : 0
Instructions Issued : 17000001811
Unhalted core cycles : 5314227923
Unhalted reference cycles : 4252020624
uops_executed_port.port_0 : 5016261477
uops_executed_port.port_1 : 5036728509
uops_executed_port.port_2 : 5282
uops_executed_port.port_3 : 12481
Instructions Issued : 17000001816
Unhalted core cycles : 5329351248
Unhalted reference cycles : 4265809728
uops_executed_port.port_4 : 7087
uops_executed_port.port_5 : 4946019835
uops_executed_port.port_6 : 1000228324
uops_executed_port.port_7 : 1372
Instructions Issued : 17000001816
Unhalted core cycles : 5325153463
Unhalted reference cycles : 4261060248
resource_stalls.any : 1322734589
resource_stalls.rs : 844250210
resource_stalls.sb : 0
resource_stalls.rob : 0
Instructions Issued : 17000001814
Unhalted core cycles : 5327823817
Unhalted reference cycles : 4262914728
uops_retired.all : 16000445793
uops_retired.all<1 : 687284798
uops_retired.all>=1 : 4646263984
uops_retired.all>=2 : 4452324050
Instructions Issued : 17000001809
Unhalted core cycles : 5311736558
Unhalted reference cycles : 4250015688
uops_retired.all>=3 : 3545695253
uops_retired.all>=4 : 3341664653
uops_retired.all>=5 : 1016
uops_retired.all>=6 : 1
Instructions Issued : 17000001871
Unhalted core cycles : 5477215269
Unhalted reference cycles : 4383891984
inst_retired.any_p : 17000001871
inst_retired.any_p<1 : 891904306
inst_retired.any_p>=1 : 4593972062
inst_retired.any_p>=2 : 4441024510
Instructions Issued : 17000001835
Unhalted core cycles : 5377202052
Unhalted reference cycles :