perf symbols: Add Rust demangling
author David Tolnay <dtolnay@gmail.com>
Sat, 9 Jul 2016 07:20:00 +0000 (00:20 -0700)
committer Arnaldo Carvalho de Melo <acme@redhat.com>
Tue, 12 Jul 2016 19:12:38 +0000 (16:12 -0300)
Rust demangling is an additional step after BFD demangling. Add a check that
identifies mangled Rust symbols based on the hash that the Rust mangler appends
as the last path component, as well as other characteristics.  Add a demangler
to reconstruct the original symbol.

Committer notes:

How I tested it:

Enabled COPR on Fedora 24 and then installed the 'rust-binary' package,
with it:

  $ cat src/main.rs
  fn main() {
      println!("Hello, world!");
  }
  $ cat Cargo.toml
  [package]

  name = "hello_world"
  version = "0.0.1"
  authors = [ "Arnaldo Carvalho de Melo <acme@kernel.org>" ]

  $ perf record cargo bench
   Compiling hello_world v0.0.1 (file:///home/acme/projects/hello_world)
     Running target/release/hello_world-d4b9dab4b2a47d75

  running 0 tests

  test result: ok. 0 passed; 0 failed; 0 ignored; 0 measured

  [ perf record: Woken up 1 times to write data ]
  [ perf record: Captured and wrote 0.096 MB perf.data (1457 samples) ]
  $

Before this patch:

  $ perf report --stdio --dsos librbml-e8edd0fd.so
  # dso: librbml-e8edd0fd.so
  #
  # Total Lost Samples: 0
  #
  # Samples: 1K of event 'cycles:u'
  # Event count (approx.): 979599126
  #
  # Overhead  Command  Symbol
  # ........  .......  .............................................................................................................
  #
       1.78%  rustc    [.] rbml::reader::maybe_get_doc::hb9d387df6024b15b
       1.50%  rustc    [.] _$LT$reader..DocsIterator$LT$$u27$a$GT$$u20$as$u20$std..iter..Iterator$GT$::next::hd9af9e60d79a35c8
       1.20%  rustc    [.] rbml::reader::doc_at::hc88107fba445af31
       0.46%  rustc    [.] _$LT$reader..TaggedDocsIterator$LT$$u27$a$GT$$u20$as$u20$std..iter..Iterator$GT$::next::h0cb40e696e4bb489
       0.35%  rustc    [.] rbml::reader::Decoder::_next_int::h66eef7825a398bc3
       0.29%  rustc    [.] rbml::reader::Decoder::_next_sub::h8e5266005580b836
       0.15%  rustc    [.] rbml::reader::get_doc::h094521c645459139
       0.14%  rustc    [.] _$LT$reader..Decoder$LT$$u27$doc$GT$$u20$as$u20$serialize..Decoder$GT$::read_u32::h0acea2fff9669327
       0.07%  rustc    [.] rbml::reader::Decoder::next_doc::h6714d469c9dfaf91
       0.07%  rustc    [.] _ZN4rbml6reader10doc_as_u6417h930b740aa94f1d3aE@plt
       0.06%  rustc    [.] _fini
  $

After:

  $ perf report --stdio --dsos librbml-e8edd0fd.so
  # dso: librbml-e8edd0fd.so
  #
  # Total Lost Samples: 0
  #
  # Samples: 1K of event 'cycles:u'
  # Event count (approx.): 979599126
  #
  # Overhead  Command  Symbol
  # ........  .......  .................................................................
  #
     1.78%  rustc    [.] rbml::reader::maybe_get_doc
     1.50%  rustc    [.] <reader::DocsIterator<'a> as std::iter::Iterator>::next
     1.20%  rustc    [.] rbml::reader::doc_at
     0.46%  rustc    [.] <reader::TaggedDocsIterator<'a> as std::iter::Iterator>::next
     0.35%  rustc    [.] rbml::reader::Decoder::_next_int
     0.29%  rustc    [.] rbml::reader::Decoder::_next_sub
     0.15%  rustc    [.] rbml::reader::get_doc
     0.14%  rustc    [.] <reader::Decoder<'doc> as serialize::Decoder>::read_u32
     0.07%  rustc    [.] rbml::reader::Decoder::next_doc
     0.07%  rustc    [.] _ZN4rbml6reader10doc_as_u6417h930b740aa94f1d3aE@plt
     0.06%  rustc    [.] _fini
  $

Signed-off-by: David Tolnay <dtolnay@gmail.com>
Tested-by: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Link: http://lkml.kernel.org/r/5780B7FA.3030602@gmail.com
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
tools/perf/util/Build
tools/perf/util/demangle-rust.c [new file with mode: 0644]
tools/perf/util/demangle-rust.h [new file with mode: 0644]
tools/perf/util/symbol-elf.c

index eda68f58288492caa395d241bd560cd39eb4ec6e..2fa7d8b6987314b0e6b1724eef627e22768b380a 100644 (file)
@@ -113,6 +113,7 @@ libperf-y += scripting-engines/
 libperf-$(CONFIG_ZLIB) += zlib.o
 libperf-$(CONFIG_LZMA) += lzma.o
 libperf-y += demangle-java.o
+libperf-y += demangle-rust.o
 
 ifdef CONFIG_JITDUMP
 libperf-$(CONFIG_LIBELF) += jitdump.o
diff --git a/tools/perf/util/demangle-rust.c b/tools/perf/util/demangle-rust.c
new file mode 100644 (file)
index 0000000..f9dafa8
--- /dev/null
@@ -0,0 +1,269 @@
+#include <string.h>
+#include "util.h"
+#include "debug.h"
+
+#include "demangle-rust.h"
+
+/*
+ * Mangled Rust symbols look like this:
+ *
+ *     _$LT$std..sys..fd..FileDesc$u20$as$u20$core..ops..Drop$GT$::drop::hc68340e1baa4987a
+ *
+ * The original symbol is:
+ *
+ *     <std::sys::fd::FileDesc as core::ops::Drop>::drop
+ *
+ * The last component of the path is a 64-bit hash in lowercase hex, prefixed
+ * with "h". Rust does not have a global namespace between crates, an illusion
+ * which Rust maintains by using the hash to distinguish things that would
+ * otherwise have the same symbol.
+ *
+ * Any path component not starting with a XID_Start character is prefixed with
+ * "_".
+ *
+ * The following escape sequences are used:
+ *
+ *     ","  =>  $C$
+ *     "@"  =>  $SP$
+ *     "*"  =>  $BP$
+ *     "&"  =>  $RF$
+ *     "<"  =>  $LT$
+ *     ">"  =>  $GT$
+ *     "("  =>  $LP$
+ *     ")"  =>  $RP$
+ *     " "  =>  $u20$
+ *     "'"  =>  $u27$
+ *     "["  =>  $u5b$
+ *     "]"  =>  $u5d$
+ *     "~"  =>  $u7e$
+ *
+ * A double ".." means "::" and a single "." means "-".
+ *
+ * The only characters allowed in the mangled symbol are a-zA-Z0-9 and _.:$
+ */
+
+static const char *hash_prefix = "::h";	/* path separator plus the 'h' that introduces the hash */
+static const size_t hash_prefix_len = 3;	/* number of characters in hash_prefix */
+static const size_t hash_len = 16;	/* hex digits in the 64-bit hash */
+
+static bool is_prefixed_hash(const char *start);	/* forward declaration; defined after rust_is_mangled() */
+static bool looks_like_rust(const char *sym, size_t len);	/* forward declaration; defined after is_prefixed_hash() */
+static bool unescape(const char **in, char **out, const char *seq, char value);	/* forward declaration; defined at the end of the file */
+
+/*
+ * Decide whether a symbol still carries Rust mangling.
+ *
+ * INPUT:
+ *     sym: symbol that has been through BFD-demangling; NULL yields false
+ *
+ * The symbol is accepted only when every one of these indicators holds:
+ *
+ *  1. It ends in a hash: "::h" followed by 16 lowercase hex digits, with at
+ *     least one character in front of it.
+ *
+ *  2. As a sanity check, the hash uses between 5 and 15 of the 16 possible
+ *     hex digits. This is true of 99.9998% of hashes so once in your life you
+ *     may see a false negative. The point is to notice path components that
+ *     could be Rust hashes but are probably not, like "haaaaaaaaaaaaaaaa". A
+ *     false positive (a non-Rust symbol loses an important path component
+ *     because it looks like a Rust hash) is worse than a false negative (the
+ *     rare Rust symbol is not demangled), so the balance is set in favor of
+ *     false negatives.
+ *
+ *  3. Ahead of the hash there are no characters other than a-zA-Z0-9 and _.:$
+ *
+ *  4. There are no unrecognized $-sign sequences.
+ *
+ *  5. There is no sequence of three or more dots in a row ("...").
+ */
+bool
+rust_is_mangled(const char *sym)
+{
+       const size_t suffix_len = hash_prefix_len + hash_len;
+       size_t sym_len;
+
+       if (sym == NULL)
+               return false;
+
+       sym_len = strlen(sym);
+
+       /* "::h" + hash must be preceded by at least one more character */
+       if (sym_len <= suffix_len)
+               return false;
+
+       /* Check the trailing hash first, then everything in front of it */
+       return is_prefixed_hash(sym + (sym_len - suffix_len))
+               && looks_like_rust(sym, sym_len - suffix_len);
+}
+
+/*
+ * Recognize the hash component at the start of str: the prefix "::h"
+ * followed by 16 lowercase hex digits, of which between 5 and 15 (inclusive)
+ * must be distinct. Returns false as soon as the prefix or any digit does
+ * not match.
+ */
+static bool is_prefixed_hash(const char *str)
+{
+       const char *digits;
+       bool seen[16] = { false };
+       size_t i;
+       int distinct = 0;
+
+       if (strncmp(str, hash_prefix, hash_prefix_len) != 0)
+               return false;
+
+       /* Record which of the 16 hex values occur in the hash */
+       digits = str + hash_prefix_len;
+       for (i = 0; i < hash_len; i++) {
+               const char c = digits[i];
+
+               if (c >= '0' && c <= '9')
+                       seen[c - '0'] = true;
+               else if (c >= 'a' && c <= 'f')
+                       seen[c - 'a' + 10] = true;
+               else
+                       return false;
+       }
+
+       for (i = 0; i < sizeof(seen) / sizeof(seen[0]); i++)
+               distinct += seen[i] ? 1 : 0;
+
+       return distinct >= 5 && distinct <= 15;
+}
+
+/*
+ * Check that the first len bytes of str hold only what a mangled Rust path
+ * may contain: a-zA-Z0-9, '_', '.', ':' and the recognized "$...$" escape
+ * sequences, with no run of three or more dots.
+ *
+ * str must be NUL-terminated: the lookahead for escapes and dots may read
+ * beyond str + len, but it stops at the terminator.
+ */
+static bool looks_like_rust(const char *str, size_t len)
+{
+       static const char *const escapes[] = {
+               "$C$",
+               "$SP$", "$BP$", "$RF$", "$LT$", "$GT$", "$LP$", "$RP$",
+               "$u20$", "$u27$", "$u5b$", "$u5d$", "$u7e$",
+       };
+       const size_t nr_escapes = sizeof(escapes) / sizeof(escapes[0]);
+       const char *cur = str;
+       const char *const limit = str + len;
+
+       while (cur < limit) {
+               const char c = *cur;
+               size_t i, skip;
+
+               if (c == '$') {
+                       /* Step over a known escape, reject anything else */
+                       skip = 0;
+                       for (i = 0; i < nr_escapes && !skip; i++) {
+                               const size_t n = strlen(escapes[i]);
+
+                               if (!strncmp(cur, escapes[i], n))
+                                       skip = n;
+                       }
+                       if (!skip)
+                               return false;
+                       cur += skip;
+                       continue;
+               }
+
+               /* Do not allow three or more consecutive dots */
+               if (c == '.' && cur[1] == '.' && cur[2] == '.')
+                       return false;
+
+               if (!((c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z')
+                     || (c >= '0' && c <= '9')
+                     || c == '_' || c == ':' || c == '.'))
+                       return false;
+
+               cur++;
+       }
+
+       return true;
+}
+
+/*
+ * Rewrite a mangled Rust symbol into its original form.
+ *
+ * INPUT:
+ *     sym: symbol for which rust_is_mangled(sym) returns true
+ *
+ * The trailing "::h" + hash component is dropped, every "$...$" escape is
+ * replaced by the character it stands for, ".." becomes "::", "." becomes
+ * "-", and an underscore that only shields an escape at the start of a path
+ * component is removed.
+ *
+ * The input is demangled in-place because the mangled name is always longer
+ * than the demangled one.
+ *
+ * A NULL sym, or one too short to hold the hash suffix, is left untouched.
+ * On an unexpected character or escape sequence an error is logged and the
+ * output is terminated after what has been produced so far.
+ */
+void
+rust_demangle_sym(char *sym)
+{
+       const char *in;
+       char *out;
+       const char *end;
+       size_t len;
+
+       if (!sym)
+               return;
+
+       /*
+        * Without this check a short string would make the subtraction below
+        * wrap around and place "end" outside the buffer.
+        */
+       len = strlen(sym);
+       if (len < hash_prefix_len + hash_len)
+               return;
+
+       in = sym;
+       out = sym;
+       end = sym + len - (hash_prefix_len + hash_len);
+
+       while (in < end)
+               switch (*in) {
+               case '$':
+                       /* unescape() advances in and out on the first match */
+                       if (!(unescape(&in, &out, "$C$", ',')
+                                       || unescape(&in, &out, "$SP$", '@')
+                                       || unescape(&in, &out, "$BP$", '*')
+                                       || unescape(&in, &out, "$RF$", '&')
+                                       || unescape(&in, &out, "$LT$", '<')
+                                       || unescape(&in, &out, "$GT$", '>')
+                                       || unescape(&in, &out, "$LP$", '(')
+                                       || unescape(&in, &out, "$RP$", ')')
+                                       || unescape(&in, &out, "$u20$", ' ')
+                                       || unescape(&in, &out, "$u27$", '\'')
+                                       || unescape(&in, &out, "$u5b$", '[')
+                                       || unescape(&in, &out, "$u5d$", ']')
+                                       || unescape(&in, &out, "$u7e$", '~'))) {
+                               pr_err("demangle-rust: unexpected escape sequence\n");
+                               goto done;
+                       }
+                       break;
+               case '_':
+                       /*
+                        * If this is the start of a path component and the next
+                        * character is an escape sequence, ignore the
+                        * underscore. The mangler inserts an underscore to make
+                        * sure the path component begins with a XID_Start
+                        * character.
+                        */
+                       if ((in == sym || in[-1] == ':') && in[1] == '$')
+                               in++;
+                       else
+                               *out++ = *in++;
+                       break;
+               case '.':
+                       if (in[1] == '.') {
+                               /* ".." becomes "::" */
+                               *out++ = ':';
+                               *out++ = ':';
+                               in += 2;
+                       } else {
+                               /* "." becomes "-" */
+                               *out++ = '-';
+                               in++;
+                       }
+                       break;
+               case 'a' ... 'z':
+               case 'A' ... 'Z':
+               case '0' ... '9':
+               case ':':
+                       *out++ = *in++;
+                       break;
+               default:
+                       pr_err("demangle-rust: unexpected character '%c' in symbol\n",
+                               *in);
+                       goto done;
+               }
+
+done:
+       /* Terminate here; this is what cuts off the hash suffix */
+       *out = '\0';
+}
+
+/*
+ * If the text at *in starts with the escape sequence seq, store value at
+ * *out, move *in past the sequence and *out past the stored character, and
+ * return true. Otherwise leave both cursors alone and return false.
+ */
+static bool unescape(const char **in, char **out, const char *seq, char value)
+{
+       const size_t seq_len = strlen(seq);
+       const bool matched = strncmp(*in, seq, seq_len) == 0;
+
+       if (matched) {
+               /* Write first: in and out may point into the same buffer */
+               *(*out)++ = value;
+               *in += seq_len;
+       }
+
+       return matched;
+}
diff --git a/tools/perf/util/demangle-rust.h b/tools/perf/util/demangle-rust.h
new file mode 100644 (file)
index 0000000..7b41ead
--- /dev/null
@@ -0,0 +1,7 @@
+#ifndef __PERF_DEMANGLE_RUST
+#define __PERF_DEMANGLE_RUST 1
+
+#include <stdbool.h>	/* bool is used in the prototype below */
+
+/*
+ * Returns true when str, a symbol that has been through BFD demangling, ends
+ * in a Rust hash component and otherwise looks like a mangled Rust path.
+ */
+bool rust_is_mangled(const char *str);
+
+/*
+ * Demangles str in place. str must be a symbol for which
+ * rust_is_mangled(str) returns true.
+ */
+void rust_demangle_sym(char *str);
+
+#endif /* __PERF_DEMANGLE_RUST */
index cebf98ec27bc464a1754dcdf1e70a3d184c49600..a34321e9b44d8a42c1f4839b844b15e196a6417b 100644 (file)
@@ -7,6 +7,7 @@
 
 #include "symbol.h"
 #include "demangle-java.h"
+#include "demangle-rust.h"
 #include "machine.h"
 #include "vdso.h"
 #include <symbol/kallsyms.h>
@@ -1081,6 +1082,13 @@ new_symbol:
                        demangled = bfd_demangle(NULL, elf_name, demangle_flags);
                        if (demangled == NULL)
                                demangled = java_demangle_sym(elf_name, JAVA_DEMANGLE_NORET);
+                       else if (rust_is_mangled(demangled))
+                               /*
+                                * Input to Rust demangling is the BFD-demangled
+                                * name which it Rust-demangles in place.
+                                */
+                               rust_demangle_sym(demangled);
+
                        if (demangled != NULL)
                                elf_name = demangled;
                }