{"id":377,"date":"2024-03-06T11:42:51","date_gmt":"2024-03-06T03:42:51","guid":{"rendered":"https:\/\/likesite.win\/?p=377"},"modified":"2024-04-19T13:17:24","modified_gmt":"2024-04-19T05:17:24","slug":"vtune-2","status":"publish","type":"post","link":"https:\/\/189505.xyz\/?p=377","title":{"rendered":"vtune"},"content":{"rendered":"<div id=\"ez-toc-container\" class=\"ez-toc-v2_0_40 counter-hierarchy ez-toc-counter ez-toc-grey ez-toc-container-direction\">\n<div class=\"ez-toc-title-container\">\n<p class=\"ez-toc-title\">Table of Contents<\/p>\n<span class=\"ez-toc-title-toggle\"><a href=\"#\" class=\"ez-toc-pull-right ez-toc-btn ez-toc-btn-xs ez-toc-btn-default ez-toc-toggle\" area-label=\"ez-toc-toggle-icon-1\"><label for=\"item-6a5ff974b9ee4\" aria-label=\"Table of Content\"><span style=\"display: flex;align-items: center;width: 35px;height: 30px;justify-content: center;direction:ltr;\"><svg style=\"fill: #999;color:#999\" xmlns=\"http:\/\/www.w3.org\/2000\/svg\" class=\"list-377408\" width=\"20px\" height=\"20px\" viewBox=\"0 0 24 24\" fill=\"none\"><path d=\"M6 6H4v2h2V6zm14 0H8v2h12V6zM4 11h2v2H4v-2zm16 0H8v2h12v-2zM4 16h2v2H4v-2zm16 0H8v2h12v-2z\" fill=\"currentColor\"><\/path><\/svg><svg style=\"fill: #999;color:#999\" class=\"arrow-unsorted-368013\" xmlns=\"http:\/\/www.w3.org\/2000\/svg\" width=\"10px\" height=\"10px\" viewBox=\"0 0 24 24\" version=\"1.2\" baseProfile=\"tiny\"><path d=\"M18.2 9.3l-6.2-6.3-6.2 6.3c-.2.2-.3.4-.3.7s.1.5.3.7c.2.2.4.3.7.3h11c.3 0 .5-.1.7-.3.2-.2.3-.5.3-.7s-.1-.5-.3-.7zM5.8 14.7l6.2 6.3 6.2-6.3c.2-.2.3-.5.3-.7s-.1-.5-.3-.7c-.2-.2-.4-.3-.7-.3h-11c-.3 0-.5.1-.7.3-.2.2-.3.5-.3.7s.1.5.3.7z\"\/><\/svg><\/span><\/label><input  type=\"checkbox\" id=\"item-6a5ff974b9ee4\"><\/a><\/span><\/div>\n<nav><ul class='ez-toc-list ez-toc-list-level-1 ' ><li class='ez-toc-page-1 ez-toc-heading-level-1'><a class=\"ez-toc-link ez-toc-heading-1\" href=\"https:\/\/189505.xyz\/?p=377\/#collect\" title=\"\ncollect \n\">\ncollect \n<\/a><\/li><li 
class='ez-toc-page-1 ez-toc-heading-level-1'><a class=\"ez-toc-link ez-toc-heading-2\" href=\"https:\/\/189505.xyz\/?p=377\/#memory-access\" title=\"\nmemory-access \n\">\nmemory-access \n<\/a><\/li><li class='ez-toc-page-1 ez-toc-heading-level-1'><a class=\"ez-toc-link ez-toc-heading-3\" href=\"https:\/\/189505.xyz\/?p=377\/#hotspots\" title=\"\nhotspots \n  \">\nhotspots \n  <\/a><\/li><li class='ez-toc-page-1 ez-toc-heading-level-1'><a class=\"ez-toc-link ez-toc-heading-4\" href=\"https:\/\/189505.xyz\/?p=377\/#%E5%BF%AB%E9%80%9F%E5%91%BD%E4%BB%A4\" title=\"\n  \u5feb\u901f\u547d\u4ee4 \n  \">\n  \u5feb\u901f\u547d\u4ee4 \n  <\/a><ul class='ez-toc-list-level-2'><li class='ez-toc-heading-level-2'><a class=\"ez-toc-link ez-toc-heading-5\" href=\"https:\/\/189505.xyz\/?p=377\/#memory-access-2\" title=\"\n  memory-access \n  \">\n  memory-access \n  <\/a><\/li><\/ul><\/li><\/ul><\/nav><\/div>\n<h1><span class=\"ez-toc-section\" id=\"collect\"><\/span>\ncollect<br \/>\n<span class=\"ez-toc-section-end\"><\/span><\/h1>\n<pre><code># \/opt\/intel\/oneapi\/vtune\/latest\/bin64\/vtune -help collect\nIntel(R) VTune(TM) Profiler Command Line Tool\nCopyright (C) 2009 Intel Corporation. All rights reserved.\n\n-c, -collect=&lt;string&gt;         Choose an analysis type.\n\n Perform a data collection of the specified analysis type.\n See the list of available analysis types below.\n\nAction Options:\n\n-allow-multiple-runs | -no-allow-multiple-runs (default)\n                              Enable multiple runs to achieve more precise\n                              results for hardware event-based collections.\n                              When disabled, the collector multiplexes events\n                              running a single collection, which lowers result\n                              precision.\n-analyze-kvm-guest | -no-analyze-kvm-guest (default)\n                              Enable to analyze KVM guest OS running on the\n                              system. 
This option is applicable to hardware\n                              event-based analysis types only.\n-analyze-system | -no-analyze-system (default)\n                              Enable to analyze all processes running on the\n                              system. When disabled, only the attached process\n                              and its children are analyzed. This option is\n                              applicable to hardware event-based analysis types\n                              only.\n-app-working-dir=&lt;string&gt;     Specify a directory where the application will be\n                              run.\n-auto-finalize | -no-auto-finalize\n                              The option is deprecated. Please use\n                              -finalization-mode=none instead. Turn on\/off\n                              automatic result finalization after data\n                              collection\/import. --no-auto-finalize option also\n                              turns off the summary report (--no-summary).\n-call-stack-mode=user-only | user-plus-one | all\n                              Choose how to show system functions in the stack.\n-cpu-mask=&lt;string&gt;            Specify CPU(s) to collect data on (for example:\n                              2-8,10,12-14). This option is applicable to\n                              hardware event-based analysis types only.\n-custom-collector=&lt;string&gt;    Provide a command line for launching an external\n                              collection tool. 
You can later import custom\n                              collection data (time intervals and counters) in\n                              a CSV format to the VTune Profiler result.\n-data-limit=&lt;integer&gt; (1000)  Limit the amount of raw data to be collected by\n                              setting the maximum possible result size (in MB).\n                              VTune Profiler starts collecting data from the\n                              beginning of the target execution and ends when\n                              the limit for the result size is reached. For\n                              unlimited data size, specify 0.\n-discard-raw-data | -no-discard-raw-data (default)\n                              Discard raw collector data from the result upon\n                              finalization.\n-d, -duration=&lt;string&gt;        Specify a duration for collection (in seconds).\n                              Required for system-wide collection. Can also be\n                              &#039;unlimited&#039;.\n-finalization-mode=full | fast | deferred | none\n                              Define finalization mode: full - perform full\n                              finalization; fast (default) - reduce loaded\n                              sample count to speed up post-processing;\n                              deferred - calculate only binary checksum for\n                              finalization on another machine; none - skip\n                              finalization.\n-finalization-mode=full | fast | deferred | none (fast)\n                              Finalization may take significant system\n                              resources. 
For a powerful target system, select\n                              full mode to apply immediately after collection.\n                              Otherwise, shorten finalization or defer it to\n                              run on another system (compute checksums only).\n-follow-child (default) | -no-follow-child\n                              Collect data on processes launched by the target\n                              process (recommended for applications launched by\n                              a script).\n-inline-mode=on | off         Choose to show or hide inline functions in the\n                              stack.\n-k, -knob=&lt;string&gt;            Set knob value for selected analysis type as\n                              -knob knobName=knobValue. For a list of knobs\n                              available for an analysis, enter: -help collect\n                              &lt;analysis_type&gt;.\n-kvm-guest-kallsyms=&lt;string&gt;  Specify a local path to the \/proc\/kallsyms file\n                              copied from the guest OS for proper symbol\n                              resolution.\n-kvm-guest-modules=&lt;string&gt;   Specify a local path to the \/proc\/modules file\n                              copied from the guest OS for proper symbol\n                              resolution.\n-loop-mode=loop-only | loop-and-function | function-only\n                              Choose to show or hide loops in the stack.\n-mrte-mode=auto | native | mixed | managed (auto)\n                              Select a profiling mode. The Native mode does not\n                              attribute data to managed source. The Mixed mode\n                              attributes data to managed source where\n                              appropriate. 
The Managed mode tries to limit\n                              attribution to managed source when available.\n-r, -result-dir=&lt;string&gt; (r@@@{at})\n                              Specify result directory path. The default name\n                              for a result directory is r@@@{at}, where @@@ is\n                              the incremented number of the result, and {at} is\n                              a two- or three-letter abbreviation for the\n                              analysis type.\n-resume-after=&lt;double&gt;        Specify time (in seconds, with fractions allowed)\n                              to delay data collection after the application\n                              starts. For example, 1.56 is 1 sec 560 msec.\n-return-app-exitcode | -no-return-app-exitcode (default)\n                              Return the target exit code instead of the\n                              command line interface exit code.\n-ring-buffer=&lt;double&gt; (0)     Limit the amount of raw data to be collected by\n                              setting the timer enabling the analysis only for\n                              the last seconds before the target or collection\n                              is terminated. For unlimited data size, specify\n                              0.\n-search-dir=&lt;string&gt;          Specify search directories for binary and symbol\n                              files. When the files are in multiple\n                              directories, use the search-dir option multiple\n                              times so that all the necessary directories are\n                              searched.\n-source-search-dir=&lt;string&gt;   Specify search directories for source files. 
When\n                              your source files are in multiple directories,\n                              use the source-search-dir option multiple times\n                              so that all the necessary directories are\n                              searched.\n-start-paused                 Start data collection paused.\n-strategy=&lt;string&gt;            Specify details for parent and child processes\n                              analysis.\n                              Format:&lt;process_name1&gt;:&lt;profiling_mode&gt;,&lt;process_\n                              name2&gt;:&lt;profiling_mode&gt;,... Available profiling\n                              mode values are: trace:trace, trace:notrace,\n                              notrace:notrace, notrace:trace. This option is\n                              not applicable to hardware event-based analysis\n                              types.\n-summary (default) | -no-summary\n                              Turn on\/off showing the summary report after data\n                              collection\/import.\n-target-duration-type=veryshort | short | medium | long (short)\n                              Estimate the application duration time. This\n                              value affects the size of collected data. For\n                              long running targets, sampling interval is\n                              increased to reduce the result size. For hardware\n                              event-based analysis types, the duration estimate\n                              affects a multiplier applied to the configured\n                              Sample after value.\n-target-install-dir=&lt;string&gt;  Specify a path to VTune Profiler on the remote\n                              system. 
If the default location is used, this\n                              path is automatically supplied.\n-target-pid=&lt;unsigned integer&gt;\n                              Attach collection to a running process specified\n                              by process ID.\n-target-ports=&lt;string&gt;        Specify a network port used by the target\n                              collector on the remote system.\n-target-process=&lt;string&gt;      Attach collection to a running process specified\n                              by process name.\n-target-system=&lt;string&gt;       Define target system for remote collection.\n                              Supported &lt;string&gt; values:\n                              android - for Android systems.\n                              ssh:user@target - for Linux systems, where &lt;user&gt;\n                              is a user name and &lt;target&gt; is a network name of\n                              the remote system accessed via SSH (usually IP\n                              address).\n-target-tmp-dir=&lt;string&gt;      Specify a directory on the remote system where\n                              performance results are temporarily stored. By\n                              default, \/tmp directory is used.\n-trace-mpi | -no-trace-mpi (default)\n                              Configure collectors to trace MPI code, and\n                              determine MPI rank IDs in case of a non-Intel MPI\n                              library implementation.\n\nGlobal Options:\n\n-q, -quiet                    Suppress non-essential messages\n-user-data-dir=&lt;string&gt;       Specify the base directory for result paths\n                              provided by --result-dir option. 
By default, the\n                              current working directory is used.\n-v, -verbose                  Print additional information\n\nExamples:\n\n 1) Perform the hotspots collection on the given target.\n\n    vtune -collect hotspots a.out\n\n The default naming template for result directories is r@@@{at}, where:\n @@@ is an increasing numeric sequence automatically assigned by vtune;\n {at} is an abbreviation of the analysis type.\n\n 2) Collect the results into the &#039;r001tr&#039; result directory.\n\n    vtune -collect threading -r r001tr a.out\n\n Use &#039;-help collect &lt;analysis type&gt;&#039; for more information about each analysis type.\n\nAvailable Analysis Types:\n\n Want to characterize and identify relevant analysis types for your workload?\n\n   performance-snapshot\n      Get a quick snapshot of your application performance and identify next\n      steps for deeper analysis.\n\n Want to find out where your application spends time and optimize your algorithms?\n\n   hotspots\n      Identify the most time consuming functions and lines of source code.\n\n   anomaly-detection\n      Preview feature - should we keep it, change it, or drop it? Send us your\n      comments: mailto:parallel.studio.support@intel.com?subject=VTune\n      Profiler: Anomaly Detection - preview feedback. Identify performance\n      anomalies by profiling critical code at the microsecond level. 
Anomaly\n      Detection uses Intel Processor Trace technology for fine-grained\n      analysis.\n\n   memory-consumption\n      Analyze memory consumption by your application, its distinct memory\n      objects and their allocation stacks.\n\n Want to see how efficiently your code is using the underlying hardware?\n\n   uarch-exploration\n      Analyze CPU microarchitecture bottlenecks affecting the performance of\n      your application.\n\n   memory-access\n      Measure a set of metrics to identify memory access related issues.\n\n Want to assess the compute efficiency of your multi-threaded application?\n\n   threading\n      Discover how well your application is using parallelism to take advantage\n      of all available CPU cores.\n\n   hpc-performance\n      Analyze performance aspects of compute-intensive applications, including\n      CPU and GPU utilization. Get information on OpenMP efficiency, memory\n      access, and vectorization.\n\n Want to see how efficiently your code is using I\/O?\n\n   io\n      Analyze utilization of IO subsystems, CPU, and processor buses.\n\n Want to explore GPU\/FPGA usage for your application?\n\n   gpu-offload\n      Explore code execution on various CPU and GPU cores on your platform,\n      estimate how your code benefits from offloading to the GPU, and identify\n      whether your application is CPU or GPU bound.\n\n   gpu-hotspots\n      Analyze the most time-consuming GPU kernels, characterize GPU utilization\n      based on GPU hardware metrics, identify performance issues caused by\n      memory latency or inefficient kernel algorithms, and analyze GPU\n      instruction frequency per certain instruction types.\n\n   fpga-interaction\n      Analyze CPU\/FPGA interaction issues through these ways: 1. Focus on the\n      kernels running on the FPGA. 2. Identify the most time-consuming kernels.\n      3. Look at the corresponding metrics on the device side (like Occupancy\n      or Stalls). 4. 
Correlate with CPU and platform profiling data.\n\n Want to explore CPU, GPU and power usage for your application\/system?\n\n   system-overview\n      Analyze general behavior of Linux or Android target system and correlate\n      power and performance metrics with IRQ handling.\n\n   graphics-rendering\n      Preview feature. Analyze the CPU\/GPU utilization of your code running on\n      the Xen virtualization platform. Explore GPU utilization per GPU engine\n      and GPU hardware metrics that help understand where performance\n      improvements are possible.\n\n   platform-profiler\n      Platform Profiler collects coarse-grained, system-level metrics for\n      extended profiling of minutes to hours. Software architects can identify\n      workloads or phases of workloads that use hardware inefficiently and need\n      tuning. Infrastructure architects can see if the current hardware\n      configuration is a good match for most workloads.\n\n Want to profile applications using Intel Transactional Synchronization Extensions or run on systems with Intel Software Guard Extensions?\n\n   tsx-exploration\n      Analyze Intel Transactional Synchronization Extensions (Intel TSX) usage.\n\n   tsx-hotspots\n      Analyze hotspots inside transactions for systems with the Intel\n      Transactional Synchronization Extensions (Intel TSX) feature enabled.\n\n   sgx-hotspots\n      Analyze hotspots inside security enclaves for systems with the Intel\n      Software Guard Extensions (Intel SGX) feature enabled.<\/code><\/pre>\n<h1><span class=\"ez-toc-section\" id=\"memory-access\"><\/span>\nmemory-access<br \/>\n<span class=\"ez-toc-section-end\"><\/span><\/h1>\n<pre><code># \/opt\/intel\/oneapi\/vtune\/latest\/bin64\/vtune -help collect memory-access\nIntel(R) VTune(TM) Profiler Command Line Tool\nCopyright (C) 2009 Intel Corporation. 
All rights reserved.\n\n Measure a set of metrics to identify memory access related issues (for\n example, specific for NUMA architectures). This analysis type is based on\n the hardware event-based sampling collection.\n\n To modify the analysis type, use the configuration options (knobs) as\n follows:\n -collect memory-access -knob &lt;knobName&gt;=&lt;knobValue&gt;\n Multiple -knob options are allowed and can be followed by additional collect\n action options, as well as global options, if needed.\n\nsampling-interval\n\n  Specify an interval (in milliseconds) between CPU samples.\n\n  Default value: 5\n  Possible values: numbers between 0.01 and 1000\n\nanalyze-mem-objects\n\n  Enable the instrumentation of dynamic memory allocation\/de-allocation and\n  map hardware events to such memory objects. This option may cause\n  additional runtime overhead due to the instrumentation of all system memory\n  allocation\/de-allocation API.\n\n  Default value: false\n  Possible values: true false\n\nmem-object-size-min-thres\n\n  Specify a minimal size of dynamic memory allocations to analyze. This\n  option helps reduce runtime overhead of the instrumentation.\n\n  Default value: 1024\n  Possible values: numbers between -2147483648 and 2147483647\n\ndram-bandwidth-limits\n\n  Evaluate maximum achievable local DRAM bandwidth before the collection\n  starts. 
This data is used to scale bandwidth metrics on the timeline and\n  calculate thresholds.\n\n  Default value: true\n  Possible values: true false\n\nanalyze-openmp\n\n  Instrument and analyze OpenMP regions to detect inefficiencies such as\n  imbalance, lock contention, or overhead on performing scheduling, reduction\n  and atomic operations.\n\n  Default value: false\n  Possible values: true false<\/code><\/pre>\n<h1><span class=\"ez-toc-section\" id=\"hotspots\"><\/span>\nhotspots<br \/>\n  <span class=\"ez-toc-section-end\"><\/span><\/h1>\n<pre><code>  # \/opt\/intel\/oneapi\/vtune\/latest\/bin64\/vtune -help collect hotspots\nIntel(R) VTune(TM) Profiler Command Line Tool\nCopyright (C) 2009 Intel Corporation. All rights reserved.\n\n Identify the most time consuming functions and drill down to see time spent\n on each line of source code. Focus optimization efforts on hot code for the\n greatest performance impact.\n\n To modify the analysis type, use the configuration options (knobs) as\n follows:\n -collect hotspots -knob &lt;knobName&gt;=&lt;knobValue&gt;\n Multiple -knob options are allowed and can be followed by additional collect\n action options, as well as global options, if needed.\n\nsampling-mode\n\n  User-Mode Sampling(sw) mode use for: profiles longer than a few seconds,\n  profiling a single process or a process-tree, profiling Python and Intel\n  runtimes. Hardware Event-Based Sampling(hw) mode use for: profiles shorter\n  than a few seconds, profiling all processes on a system, including kernel.\n\n  Default value: sw\n  Possible values: sw hw\n\nsampling-interval\n\n  Specify an interval (in milliseconds) between CPU samples for the Hardware\n  sampling mode. 
Sampling interval for the Software sampling mode is fixed\n  (10ms).\n\n  Default value: 5\n  Possible values: numbers between 0.01 and 1000\n\nenable-stack-collection\n\n  Enable collection of call stacks.\n\n  Default value: false\n  Possible values: true false\n\nstack-size\n\n  Specify the size of a raw stack (in bytes) to process. Zero value in\n  command line means unlimited size. You may set arbitrary stack size value\n  in the custom analysis configuration.\n\n  Default value: 1024\n  Possible values: 0 1024 2048 4096\n\nenable-characterization-insights\n\n  Get additional performance insights such as the efficency of hardware usage\n  and vectorization, and learn next steps. Note: this option collects CPU\n  events in the counting mode.\n\n  Default value: true\n  Possible values: true false<\/code><\/pre>\n<h1><span class=\"ez-toc-section\" id=\"%E5%BF%AB%E9%80%9F%E5%91%BD%E4%BB%A4\"><\/span>\n  \u5feb\u901f\u547d\u4ee4<br \/>\n  <span class=\"ez-toc-section-end\"><\/span><\/h1>\n<h2><span class=\"ez-toc-section\" id=\"memory-access-2\"><\/span>\n  memory-access<br \/>\n  <span class=\"ez-toc-section-end\"><\/span><\/h2>\n<pre><code>  \/opt\/intel\/oneapi\/vtune\/latest\/bin64\/vtune -collect memory-access -v -d 20 -r \/media\/disk1\/fordata\/web_server\/like12\/package\/vtune\/results\/gt441-memory-access-gameid-v2-numa-round5-debug -knob sampling-interval=0.03   -data-limit=3072 -call-stack-mode=all -inline-mode=on  -finalization-mode=full<\/code><\/pre>\n<h2>\n  \u901a\u7528<br \/>\n  <\/h2>\n<pre><code>  export VTUNE_RESULT=\"gt441-memory-access-gameid-v2-numa-round6-again\"\n\n  \/opt\/intel\/oneapi\/vtune\/latest\/bin64\/vtune -collect memory-access -v -d 20 -r \/media\/disk1\/fordata\/web_server\/like12\/package\/vtune\/results\/${VTUNE_RESULT} -knob sampling-interval=0.03   -data-limit=3072 -call-stack-mode=all -inline-mode=on  -finalization-mode=full<\/code><\/pre>\n","protected":false},"excerpt":{"rendered":"<p>collect # 
\/opt\/intel\/oneapi\/vtune\/latest\/bin64\/vtune -h &#8230; <a title=\"vtune\" class=\"read-more\" href=\"https:\/\/189505.xyz\/?p=377\" aria-label=\"More on vtune\">Read more<\/a><\/p>\n","protected":false},"author":1,"featured_media":0,"comment_status":"open","ping_status":"open","sticky":false,"template":"","format":"standard","meta":{"footnotes":""},"categories":[1],"tags":[],"_links":{"self":[{"href":"https:\/\/189505.xyz\/index.php?rest_route=\/wp\/v2\/posts\/377"}],"collection":[{"href":"https:\/\/189505.xyz\/index.php?rest_route=\/wp\/v2\/posts"}],"about":[{"href":"https:\/\/189505.xyz\/index.php?rest_route=\/wp\/v2\/types\/post"}],"author":[{"embeddable":true,"href":"https:\/\/189505.xyz\/index.php?rest_route=\/wp\/v2\/users\/1"}],"replies":[{"embeddable":true,"href":"https:\/\/189505.xyz\/index.php?rest_route=%2Fwp%2Fv2%2Fcomments&post=377"}],"version-history":[{"count":6,"href":"https:\/\/189505.xyz\/index.php?rest_route=\/wp\/v2\/posts\/377\/revisions"}],"predecessor-version":[{"id":422,"href":"https:\/\/189505.xyz\/index.php?rest_route=\/wp\/v2\/posts\/377\/revisions\/422"}],"wp:attachment":[{"href":"https:\/\/189505.xyz\/index.php?rest_route=%2Fwp%2Fv2%2Fmedia&parent=377"}],"wp:term":[{"taxonomy":"category","embeddable":true,"href":"https:\/\/189505.xyz\/index.php?rest_route=%2Fwp%2Fv2%2Fcategories&post=377"},{"taxonomy":"post_tag","embeddable":true,"href":"https:\/\/189505.xyz\/index.php?rest_route=%2Fwp%2Fv2%2Ftags&post=377"}],"curies":[{"name":"wp","href":"https:\/\/api.w.org\/{rel}","templated":true}]}}