[set] Optimize add_range() some more

It's as good as it gets, and seems to be on par with the previous set
implementation in my benchmark.

Would be great if someone can double-check my bitops.
diff --git a/src/hb-set-private.hh b/src/hb-set-private.hh
index 0fe010f..a47b17a 100644
--- a/src/hb-set-private.hh
+++ b/src/hb-set-private.hh
@@ -67,9 +67,19 @@
 
     inline void add_range (hb_codepoint_t a, hb_codepoint_t b)
     {
-      /* TODO Speed up. */
-     for (unsigned int i = a; i < b + 1; i++)
-       add (i);
+     /* Set every bit from a through b inclusive.  Assumes a <= b -- TODO confirm with callers. */
+     /* NOTE(review): relies on elt_t being unsigned so (mask (b) << 1) may wrap to 0 -- confirm. */
+     elt_t *la = &elt (a);
+     elt_t *lb = &elt (b);
+     if (la == lb)
+       *la |= (mask (b) << 1) - mask(a); /* Both ends share one element: bits a..b only. */
+     else
+     {
+       *la |= ~(mask (a) - 1); /* First element: bit a and everything above it. */
+       la++; /* Step past the partial first element so memset() cannot clobber its low bits. */
+       memset (la, 0xff, (char *) lb - (char *) la); /* Fill whole elements strictly between. */
+       *lb |= ((mask (b) << 1) - 1); /* Last element: bit b and everything below it. */
+     }
     }
 
     inline bool is_equal (const page_t *other) const