summary refs log tree commit diff stats
path: root/toolchain/musl/patches/001-git-2015-06-04.patch
blob: 0baea676703649612f15c8b671a7143389e1024b (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
commit b6a6cd703ffefa6352249fb01f4da28d85d17306
Author: Rich Felker <dalias@aerifal.cx>
Date:   Thu Jun 4 11:45:17 2015 -0400

    fix dynamic linker regression processing R_*_NONE type relocations
    
    commit f3ddd173806fd5c60b3f034528ca24542aecc5b9 inadvertently removed
    the early check for "none" type relocations, causing the address
    dso->base+0 to be dereferenced to obtain an addend. shared libraries
    (including libc.so) and PIE executables were unaffected, since their
    base addresses are the actual address of their mappings and are
    readable. non-PIE main executables, however, have a base address of 0
    because their load addresses are absolute and not offset at load time.
    
    in practice none-type relocations do not arise with toolchains that
    are in use except on mips, and on mips it's moderately rare for a
    non-PIE executable to have a relocation table, since the mips-specific
    got processing serves in its place for most purposes.

commit 585ba14df4799d50ec9682ce75825d2eafec2a6a
Author: Rich Felker <dalias@aerifal.cx>
Date:   Wed Jun 3 02:00:44 2015 -0400

    add additional Makefile dependency rules for rcrt1.o PIE start file

commit 2b4fcfdacf93c3dfd6ac15e31790a9e154374679
Author: Rich Felker <dalias@aerifal.cx>
Date:   Thu May 28 23:08:12 2015 -0400

    fix failure of ungetc and ungetwc to work on files in eof status
    
    these functions were written to handle clearing eof status, but failed
    to account for the __toread function's handling of eof. with this
    patch applied, __toread still returns EOF when the file is in eof
    status, so that read operations will fail, but it also sets up valid
    buffer pointers for read mode, which are set to the end of the buffer
    rather than the beginning in order to make the whole buffer available
    to ungetc/ungetwc.
    
    minor changes to __uflow were needed since it's now possible to have
    non-zero buffer pointers while in eof status. as made, these changes
    remove a 'fast path' bypassing the function call to __toread, which
    could be reintroduced with slightly different logic, but since
    ordinary files have a syscall in f->read, optimizing the code path
    does not seem worthwhile.
    
    the __stdio_read function is also updated not to zero the read buffer
    pointers on eof/error. while not necessary for correctness, this
    change avoids the overhead of calling __toread in ungetc after
    reaching eof, and it also reduces code size and increases consistency
    with the fmemopen read operation which does not zero the pointers.

commit b6e7c664677ab7c77f183b8c41105f2be519800c
Author: Rich Felker <dalias@aerifal.cx>
Date:   Thu May 28 15:37:23 2015 -0400

    add missing legacy LFS64 macros in sys/resource.h
    
    based on patch by Felix Janda, with RLIM64_SAVED_CUR and
    RLIM64_SAVED_MAX added for completeness.

commit fc431d3f76bb9bde34a89e4a3e4d0c27de959855
Author: Shiz <hi@shiz.me>
Date:   Thu May 28 05:52:22 2015 +0200

    configure: work around compilers that merely warn for unknown options
    
    some compilers (such as clang) accept unknown options without error,
    but then print warnings on each invocation, cluttering the build
    output and burying meaningful warnings. this patch makes configure's
    tryflag and tryldflag functions use additional options to turn the
    unknown-option warnings into errors, if available, but only at check
    time. these options are not output in config.mak to avoid the risk of
    spurious build breakage; if they work, they will have already done
    their job at configure time.

commit aeeac9ca5490d7d90fe061ab72da446c01ddf746
Author: Rich Felker <dalias@aerifal.cx>
Date:   Wed May 27 15:54:47 2015 -0400

    implement fail-safe static locales for newlocale
    
    this frees applications which need to make temporary use of the C
    locale (via uselocale) from the possibility that newlocale might fail.
    
    the C.UTF-8 locale is also provided as a static locale. presently the
    C and C.UTF-8 locales behave the same, but this may change in the future.

commit 11858d31aa020df3e7e7dedf49f9870ce12f31cc
Author: Rich Felker <dalias@aerifal.cx>
Date:   Wed May 27 03:32:46 2015 -0400

    rename internal locale file handling locale maps
    
    since the __setlocalecat function was removed, the filename
    __setlocalecat.c no longer made sense.

commit 61a3364d246e72b903da8b76c2e27a225a51351e
Author: Rich Felker <dalias@aerifal.cx>
Date:   Wed May 27 03:22:52 2015 -0400

    overhaul locale internals to treat categories roughly uniformly
    
    previously, LC_MESSAGES was treated specially as the only category
    which could be set to a locale name without a definition file, in
    order to facilitate gettext message translations when no libc locale
    was available. LC_NUMERIC was completely un-settable, and LC_CTYPE
    stored a flag intended to be used for a possible future byte-based C
    locale, instead of storing a __locale_map pointer like the other
    categories use.
    
    this patch changes all categories to be represented by pointers to
    __locale_map structures, and allows locale names without definition
    files to be treated as valid locales with a trivial definition when used
    in any category. outwardly visible functional changes should be minor,
    limited mainly to the strings read back from setlocale and the way
    gettext handles translations in categories other than LC_MESSAGES.
    
    various internal refactoring has also been performed, and improvements
    in const correctness have been made.

commit 63c188ec42e76ff768e81f6b65b11c68fc43351e
Author: Rich Felker <dalias@aerifal.cx>
Date:   Wed May 27 00:22:43 2015 -0400

    replace atomics with locks in locale-setting code
    
    this is part of a general program of removing direct use of atomics
    where they are not necessary to meet correctness or performance needs,
    but in this case it's also an optimization. only the global locale
    needs synchronization; allocated locales referenced with locale_t
    handles are immutable during their lifetimes, and using atomics to
    initialize them increases their cost of setup.

commit dc031ee0b1ba11baa00cd7f0769e461a5f396c71
Author: Rich Felker <dalias@aerifal.cx>
Date:   Tue May 26 03:37:41 2015 -0400

    add rcrt1 start file for fully static-linked PIE
    
    static-linked PIE files need startup code to relocate themselves, much
    like the dynamic linker does. rcrt1.c reuses the code in dlstart.c,
    stage 1 of the dynamic linker, which in turn reuses crt_arch.h, to
    achieve static PIE with no new code. only relative relocations are
    supported.
    
    existing toolchains that don't yet support static PIE directly can be
    repurposed by passing "-shared -Wl,-Bstatic -Wl,-Bsymbolic" instead of
    "-static -pie" and substituting rcrt1.o in place of crt1.o.
    
    all libraries being linked must be built as PIC/PIE; TEXTRELs are not
    supported at this time.

commit ed0c8249825161036356a3616e8c5247c15d0927
Author: Rich Felker <dalias@aerifal.cx>
Date:   Tue May 26 02:31:04 2015 -0400

    fix incorrect application of visibility to Scrt1.o
    
    commit de2b67f8d41e08caa56bf6540277f6561edb647f attempted to avoid
    having vis.h affect crt files, but the Makefile variable used,
    CRT_LIBS, refers to the final output copies in the lib directory, not
    the copies in the crt build directory, and thus the -DCRT was not
    applied.
    
    while unlikely to be noticed, this regression probably broke
    production of PIE executables whose main functions are not in the
    executable but rather a shared library.

commit 9bbddf730f7837cf87f4c789fbb41a312e295d6c
Author: Rich Felker <dalias@aerifal.cx>
Date:   Mon May 25 23:33:59 2015 -0400

    reprocess all libc/ldso symbolic relocations in dynamic linking stage 3
    
    commit f3ddd173806fd5c60b3f034528ca24542aecc5b9 introduced early
    relocations and subsequent reprocessing as part of the dynamic linker
    bootstrap overhaul, to allow use of arbitrary libc functions before
    the main application and libraries are loaded, but only reprocessed
    GOT/PLT relocation types.
    
    commit c093e2e8201524db0d638920e76bcb6b1d925f3a added reprocessing of
    non-GOT/PLT relocations to fix an actual regression that was observed
    on powerpc, but only for RELA format tables with out-of-line addends.
    REL table (inline addends at the relocation address) reprocessing is
    trickier because the first relocation pass clobbers the addends.
    
    this patch extends symbolic relocation reprocessing for libc/ldso to
    support all relocation types, whether REL or RELA format tables are
    used. it is believed not to alter behavior on any existing archs for
    the current dynamic linker and libc code. the motivations for this
    change are consistency and future-proofing. it ensures that behavior
    does not differ depending on whether REL or RELA tables are used,
    which could lead to undetected arch-specific bugs. it also ensures
    that, if in the future code depending on additional relocation types
    is added to libc.so, either at the source level or as part of the
    compiler runtime that gets pulled in (for example, soft-float with TLS
    for fenv), the new code will work properly.
    
    the implementation concept is simple: stage 2 of the dynamic linker
    counts the number of symbolic relocations in the libc/ldso REL table
    and allocates a VLA to save their addends into; stage 3 then uses the
    saved addends in place of the inline ones which were clobbered. for
    stack safety, a hard limit (currently 4k) is imposed on the number of
    such addends; this should be a couple orders of magnitude larger than
    the actual need. this number is not a runtime variable that could
    break fail-safety; it is constant for a given libc.so build.

commit 768b82c6de24e480267c4c251c440edfc71800e3
Author: Rich Felker <dalias@aerifal.cx>
Date:   Mon May 25 19:15:17 2015 -0400

    move call to dynamic linker stage-3 into stage-2 function
    
    this move eliminates a duplicate "by-hand" symbol lookup loop from the
    stage-1 code and replaces it with a call to find_sym, which can be
    used once we're in stage 2. it reduces the size of the stage 1 code,
    which is helpful because stage 1 will become the crt start file for
    static-PIE executables, and it will allow stage 3 to access stage 2's
    automatic storage, which will be important in an upcoming commit.

commit 967bcbf67c3ffac587de4d79abc1e5e072d83e3e
Author: Rich Felker <dalias@aerifal.cx>
Date:   Mon May 25 16:02:49 2015 -0400

    mark mips crt code as code
    
    otherwise disassemblers treat it as data.

commit 7b75c4877ddf22f219f944c61d939df1dee4f6d3
Author: Rich Felker <dalias@aerifal.cx>
Date:   Mon May 25 15:56:36 2015 -0400

    mark mips cancellable syscall code as code
    
    otherwise disassemblers treat it as data.

commit 0e0e49421f08cfd670975ecd3604f7f9015e1833
Author: Rich Felker <dalias@aerifal.cx>
Date:   Mon May 25 00:32:37 2015 -0400

    simplify/shrink relocation processing in dynamic linker stage 1
    
    the outer-loop approach made sense when we were also processing
    DT_JMPREL, which might be in REL or RELA form, to avoid major code
    duplication. commit 09db855b35709aa627d7055c57a98e1e471920ab removed
    processing of DT_JMPREL, and in the remaining two tables, the format
    (REL or RELA) is known by the name of the table. simply writing two
    versions of the loop results in smaller and simpler code.

commit 09db855b35709aa627d7055c57a98e1e471920ab
Author: Rich Felker <dalias@aerifal.cx>
Date:   Mon May 25 00:25:56 2015 -0400

    remove processing of DT_JMPREL from dynamic linker stage 1 bootstrap
    
    the DT_JMPREL relocation table necessarily consists entirely of
    JMP_SLOT (REL_PLT in internal nomenclature) relocations, which are
    symbolic; they cannot be resolved in stage 1, so there is no point in
    processing them.

commit 9f26ebded188ed78c3571a4ca1477dd6351bc647
Author: Rich Felker <dalias@aerifal.cx>
Date:   Sun May 24 23:03:47 2015 -0400

    fix stack alignment code in mips crt_arch.h
    
    the instruction used to align the stack, "and $sp, $sp, -8", does not
    actually exist; it's expanded to 2 instructions using the 'at'
    (assembler temporary) register, and thus cannot be used in a branch
    delay slot. since alignment mod 16 commutes with subtracting 8, simply
    swapping these two operations fixes the problem.
    
    crt1.o was not affected because it's still being generated from a
    dedicated asm source file. dlstart.lo was not affected because the
    stack pointer it receives is already aligned by the kernel. but
    Scrt1.o was affected in cases where the dynamic linker gave it a
    misaligned stack pointer.

commit 63caf1d207d143fe405bbe0cda9aac8deca1171a
Author: Rich Felker <dalias@aerifal.cx>
Date:   Fri May 22 01:50:05 2015 -0400

    add .text section directive to all crt_arch.h files missing it
    
    i386 and x86_64 versions already had the .text directive; other archs
    did not. normally, top-level (file scope) __asm__ starts in the .text
    section anyway, but problems were reported with some versions of
    clang, and it seems preferable to set it explicitly anyway, at least
    for the sake of consistency between archs.

commit 3b0e83264d156f9e496ab32badd89e4447b807aa
Author: Rich Felker <dalias@aerifal.cx>
Date:   Thu May 21 17:06:28 2015 -0400

    remove outdated and misleading comment in iconv.c
    
    the comment claimed that EUC/GBK/Big5 are not implemented, which has
    been incorrect since commit 19b4a0a20efc6b9df98b6a43536ecdd628ba4643.

commit 39b8ce66f2ed9c17427ec3a48be9bda29b93b9d7
Author: Rich Felker <dalias@aerifal.cx>
Date:   Thu May 21 17:01:23 2015 -0400

    in iconv_open, accept "CHAR" and "" as aliases for "UTF-8"
    
    while not a requirement, it's common convention in other iconv
    implementations to accept "CHAR" as an alias for nl_langinfo(CODESET),
    meaning the encoding used for char[] strings in the current locale,
    and also "" as an alternate form. supporting this is not costly and
    improves compatibility.

commit c648cefb27984db60474ec1747cbfde83c2856d0
Author: Rich Felker <dalias@aerifal.cx>
Date:   Wed May 20 00:17:35 2015 -0400

    fix inconsistency in a_and and a_or argument types on x86[_64]
    
    conceptually, and on other archs, these functions take a pointer to
    int, but in the i386, x86_64, and x32 versions of atomic.h, they took
    a pointer to void instead.

commit 390f93ef69153bf2087fcf3baa1776ad9a6765ab
Author: Bobby Bingham <koorogi@koorogi.info>
Date:   Sun May 17 13:46:38 2015 -0500

    inline llsc atomics when building for sh4a
    
    If we're building for sh4a, the compiler is already free to use
    instructions only available on sh4a, so we can do the same and inline the
    llsc atomics. If we're building for an older processor, we still do the
    same runtime atomics selection as before.

commit c093e2e8201524db0d638920e76bcb6b1d925f3a
Author: Rich Felker <dalias@aerifal.cx>
Date:   Mon May 18 16:51:54 2015 -0400

    reprocess libc/ldso RELA relocations in stage 3 of dynamic linking
    
    this fixes a regression on powerpc that was introduced in commit
    f3ddd173806fd5c60b3f034528ca24542aecc5b9. global data accesses on
    powerpc seem to be using a translation-unit-local GOT filled via
    R_PPC_ADDR32 relocations rather than R_PPC_GLOB_DAT. being a non-GOT
    relocation type, these were not reprocessed after adding the main
    application and its libraries to the chain, causing libc code not to
    see copy relocations in the main program, and therefore to use the
    pre-copy-relocation addresses for global data objects (like environ).
    
    the motivation for the dynamic linker only reprocessing GOT/PLT
    relocation types in stage 3 is that these types always have a zero
    addend, making them safe to process again even if the storage for the
    addend has been clobbered. other relocation types which can be used
    for address constants in initialized data objects may have non-zero
    addends which will be clobbered during the first pass of relocation
    processing if they're stored inline (REL form) rather than out-of-line
    (RELA form).
    
    powerpc generally uses only RELA, so this patch is sufficient to fix
    the regression in practice, but is not fully general, and would not
    suffice if an alternate toolchain generated REL for powerpc.

commit 43e9f652bf4b2195b04fc14c93db591b30a7b790
Author: Rich Felker <dalias@aerifal.cx>
Date:   Mon May 18 12:11:25 2015 -0400

    fix null pointer dereference in dcngettext under specific conditions
    
    if setlocale has not been called, the current locale's messages_name
    may be a null pointer. the code path where it's assumed to be non-null
    was only reachable if bindtextdomain had already been called, which is
    normally not done in programs which do not call setlocale, so the
    omitted check went unnoticed.
    
    patch from Void Linux, with description rewritten.

commit 68630b55c0c7219fe9df70dc28ffbf9efc8021d8
Author: Rich Felker <dalias@aerifal.cx>
Date:   Sat May 16 01:53:54 2015 -0400

    eliminate costly tricks to avoid TLS access for current locale state
    
    the code being removed used atomics to track whether any threads might
    be using a locale other than the current global locale, and whether
    any threads might have abstract 8-bit (non-UTF-8) LC_CTYPE active, a
    feature which was never committed (still pending). the motivations
    were to support early execution prior to setup of the thread pointer,
    to partially support systems (ancient kernels) where thread pointer
    setup is not possible, and to avoid high performance cost on archs
    where accessing the thread pointer may be very slow.
    
    since commit 19a1fe670acb3ab9ead0fe31859ca7d4fe40dd54, the thread
    pointer is always available, so these hacks are no longer needed.
    removing them greatly simplifies the affected code.

commit 707d7c30f3379441de9b320536ddfd354f4c2143
Author: Rich Felker <dalias@aerifal.cx>
Date:   Sat May 16 01:15:40 2015 -0400

    in i386 __set_thread_area, don't assume %gs register is initially zero
    
    commit f630df09b1fd954eda16e2f779da0b5ecc9d80d3 added logic to handle
    the case where __set_thread_area is called more than once by reusing
    the GDT slot already in the %gs register, and only setting up a new
    GDT slot when %gs is zero. this created a hidden assumption that %gs
    is zero when a new process image starts, which is true in practice on
    Linux, but does not seem to be documented ABI, and fails to hold under
    qemu app-level emulation.
    
    while it would in theory be possible to zero %gs in the entry point
    code, this code is shared between static and dynamic binaries, and
    dynamic binaries must not clobber the value of %gs already set up by
    the dynamic linker.
    
    the alternative solution implemented in this commit simply uses global
    data to store the GDT index that's selected. __set_thread_area should
    only be called in the initial thread anyway (subsequent threads get
    their thread pointer set up by __clone), but even if it were called by
    another thread, it would simply read and write back the same GDT index
    that was already assigned to the initial thread, and thus (in the x86
    memory model) there is no data race.

commit c0f10cf06725bd0de37f3ced7954a653bf9f1049
Author: Rich Felker <dalias@aerifal.cx>
Date:   Thu May 14 18:51:27 2015 -0400

    make arm reloc.h CRTJMP macro compatible with thumb
    
    compilers targeting armv7 may be configured to produce thumb2 code
    instead of arm code by default, and in the future we may wish to
    support targets where only the thumb instruction set is available.
    
    the instructions this patch omits in thumb mode are needed only for
    non-thumb versions of armv4 or earlier, which are not supported by any
    current compilers/toolchains and thus rather pointless to have. at
    some point these compatibility return sequences may be removed from
    all asm source files, and in that case it would make sense to remove
    them here too and remove the ifdef.

commit 83340c7a580e91b22f58321b7cf6d976af61084c
Author: Rich Felker <dalias@aerifal.cx>
Date:   Thu May 14 18:26:16 2015 -0400

    make arm crt_arch.h compatible with thumb code generation
    
    compilers targeting armv7 may be configured to produce thumb2 code
    instead of arm code by default, and in the future we may wish to
    support targets where only the thumb instruction set is available.
    
    the changes made here avoid operating directly on the sp register,
    which is not possible in thumb code, and address an issue with the way
    the address of _DYNAMIC is computed.
    
    previously, the relative address of _DYNAMIC was stored with an
    additional offset of -8 versus the pc-relative add instruction, since
    on arm the pc register evaluates to ".+8". in thumb code, it instead
    evaluates to ".+4". both are two (normal-size) instructions beyond "."
    in the current execution mode, so the numbered label 2 used in the
    relative address expression is simply moved two instructions ahead to
    be compatible with both instruction sets.

--- a/Makefile
+++ b/Makefile
@@ -44,7 +44,7 @@ ALL_INCLUDES = $(sort $(wildcard include
 
 EMPTY_LIB_NAMES = m rt pthread crypt util xnet resolv dl
 EMPTY_LIBS = $(EMPTY_LIB_NAMES:%=lib/lib%.a)
-CRT_LIBS = lib/crt1.o lib/Scrt1.o lib/crti.o lib/crtn.o
+CRT_LIBS = lib/crt1.o lib/Scrt1.o lib/rcrt1.o lib/crti.o lib/crtn.o
 STATIC_LIBS = lib/libc.a
 SHARED_LIBS = lib/libc.so
 TOOL_LIBS = lib/musl-gcc.specs
@@ -85,11 +85,13 @@ src/internal/version.h: $(wildcard VERSI
 
 src/internal/version.lo: src/internal/version.h
 
-src/ldso/dlstart.lo src/ldso/dynlink.lo: src/internal/dynlink.h arch/$(ARCH)/reloc.h
+crt/rcrt1.o src/ldso/dlstart.lo src/ldso/dynlink.lo: src/internal/dynlink.h arch/$(ARCH)/reloc.h
 
-crt/crt1.o crt/Scrt1.o src/ldso/dlstart.lo: $(wildcard arch/$(ARCH)/crt_arch.h)
+crt/crt1.o crt/Scrt1.o crt/rcrt1.o src/ldso/dlstart.lo: $(wildcard arch/$(ARCH)/crt_arch.h)
 
-crt/Scrt1.o: CFLAGS += -fPIC
+crt/rcrt1.o: src/ldso/dlstart.c
+
+crt/Scrt1.o crt/rcrt1.o: CFLAGS += -fPIC
 
 OPTIMIZE_SRCS = $(wildcard $(OPTIMIZE_GLOBS:%=src/%))
 $(OPTIMIZE_SRCS:%.c=%.o) $(OPTIMIZE_SRCS:%.c=%.lo): CFLAGS += -O3
@@ -104,7 +106,7 @@ NOSSP_SRCS = $(wildcard crt/*.c) \
 	src/ldso/dlstart.c src/ldso/dynlink.c
 $(NOSSP_SRCS:%.c=%.o) $(NOSSP_SRCS:%.c=%.lo): CFLAGS += $(CFLAGS_NOSSP)
 
-$(CRT_LIBS): CFLAGS += -DCRT
+$(CRT_LIBS:lib/%=crt/%): CFLAGS += -DCRT
 
 # This incantation ensures that changes to any subarch asm files will
 # force the corresponding object file to be rebuilt, even if the implicit
--- a/arch/aarch64/crt_arch.h
+++ b/arch/aarch64/crt_arch.h
@@ -1,4 +1,5 @@
 __asm__(
+".text \n"
 ".global " START "\n"
 ".type " START ",%function\n"
 START ":\n"
--- a/arch/arm/crt_arch.h
+++ b/arch/arm/crt_arch.h
@@ -1,15 +1,18 @@
 __asm__(
+".text \n"
 ".global " START " \n"
 ".type " START ",%function \n"
 START ": \n"
 "	mov fp, #0 \n"
 "	mov lr, #0 \n"
-"	mov a1, sp \n"
 "	ldr a2, 1f \n"
-"2:	add a2, pc, a2 \n"
-"	and sp, sp, #-16 \n"
+"	add a2, pc, a2 \n"
+"	mov a1, sp \n"
+"2:	and ip, a1, #-16 \n"
+"	mov sp, ip \n"
 "	bl " START "_c \n"
 ".weak _DYNAMIC \n"
 ".hidden _DYNAMIC \n"
-"1:	.word _DYNAMIC-2b-8 \n"
+".align 2 \n"
+"1:	.word _DYNAMIC-2b \n"
 );
--- a/arch/arm/reloc.h
+++ b/arch/arm/reloc.h
@@ -28,5 +28,10 @@
 #define REL_TPOFF       R_ARM_TLS_TPOFF32
 //#define REL_TLSDESC     R_ARM_TLS_DESC
 
+#ifdef __thumb__
+#define CRTJMP(pc,sp) __asm__ __volatile__( \
+	"mov sp,%1 ; bx %0" : : "r"(pc), "r"(sp) : "memory" )
+#else
 #define CRTJMP(pc,sp) __asm__ __volatile__( \
 	"mov sp,%1 ; tst %0,#1 ; moveq pc,%0 ; bx %0" : : "r"(pc), "r"(sp) : "memory" )
+#endif
--- a/arch/i386/atomic.h
+++ b/arch/i386/atomic.h
@@ -50,16 +50,16 @@ static inline int a_cas(volatile int *p,
 	return t;
 }
 
-static inline void a_or(volatile void *p, int v)
+static inline void a_or(volatile int *p, int v)
 {
 	__asm__( "lock ; orl %1, %0"
-		: "=m"(*(int *)p) : "r"(v) : "memory" );
+		: "=m"(*p) : "r"(v) : "memory" );
 }
 
-static inline void a_and(volatile void *p, int v)
+static inline void a_and(volatile int *p, int v)
 {
 	__asm__( "lock ; andl %1, %0"
-		: "=m"(*(int *)p) : "r"(v) : "memory" );
+		: "=m"(*p) : "r"(v) : "memory" );
 }
 
 static inline int a_swap(volatile int *x, int v)
--- a/arch/microblaze/crt_arch.h
+++ b/arch/microblaze/crt_arch.h
@@ -1,4 +1,5 @@
 __asm__(
+".text \n"
 ".global " START " \n"
 ".align  2 \n"
 START ": \n"
--- a/arch/mips/crt_arch.h
+++ b/arch/mips/crt_arch.h
@@ -1,6 +1,7 @@
 __asm__(
 ".set push\n"
 ".set noreorder\n"
+".text \n"
 ".global _" START "\n"
 ".global " START "\n"
 ".type   _" START ", @function\n"
@@ -21,8 +22,8 @@ __asm__(
 "	addu $5, $5, $gp \n"
 "	lw $25, 4($ra) \n"
 "	addu $25, $25, $gp \n"
-"	subu $sp, $sp, 16 \n"
+"	and $sp, $sp, -8 \n"
 "	jalr $25 \n"
-"	 and $sp, $sp, -8 \n"
+"	 subu $sp, $sp, 16 \n"
 ".set pop \n"
 );
--- a/arch/or1k/crt_arch.h
+++ b/arch/or1k/crt_arch.h
@@ -1,4 +1,5 @@
 __asm__(
+".text \n"
 ".global " START " \n"
 ".align  4 \n"
 START ": \n"
--- a/arch/powerpc/crt_arch.h
+++ b/arch/powerpc/crt_arch.h
@@ -1,4 +1,5 @@
 __asm__(
+".text \n"
 ".global " START " \n"
 ".type   " START ", %function \n"
 START ": \n"
--- a/arch/sh/atomic.h
+++ b/arch/sh/atomic.h
@@ -22,6 +22,88 @@ static inline int a_ctz_64(uint64_t x)
 	return a_ctz_l(y);
 }
 
+#define LLSC_CLOBBERS "r0", "t", "memory"
+#define LLSC_START(mem) "synco\n"  \
+	"0:	movli.l @" mem ", r0\n"
+#define LLSC_END(mem)              \
+	"1:	movco.l r0, @" mem "\n"    \
+	"	bf 0b\n"                   \
+	"	synco\n"
+
+static inline int __sh_cas_llsc(volatile int *p, int t, int s)
+{
+	int old;
+	__asm__ __volatile__(
+		LLSC_START("%1")
+		"	mov r0, %0\n"
+		"	cmp/eq %0, %2\n"
+		"	bf 1f\n"
+		"	mov %3, r0\n"
+		LLSC_END("%1")
+		: "=&r"(old) : "r"(p), "r"(t), "r"(s) : LLSC_CLOBBERS);
+	return old;
+}
+
+static inline int __sh_swap_llsc(volatile int *x, int v)
+{
+	int old;
+	__asm__ __volatile__(
+		LLSC_START("%1")
+		"	mov r0, %0\n"
+		"	mov %2, r0\n"
+		LLSC_END("%1")
+		: "=&r"(old) : "r"(x), "r"(v) : LLSC_CLOBBERS);
+	return old;
+}
+
+static inline int __sh_fetch_add_llsc(volatile int *x, int v)
+{
+	int old;
+	__asm__ __volatile__(
+		LLSC_START("%1")
+		"	mov r0, %0\n"
+		"	add %2, r0\n"
+		LLSC_END("%1")
+		: "=&r"(old) : "r"(x), "r"(v) : LLSC_CLOBBERS);
+	return old;
+}
+
+static inline void __sh_store_llsc(volatile int *p, int x)
+{
+	__asm__ __volatile__(
+		"	synco\n"
+		"	mov.l %1, @%0\n"
+		"	synco\n"
+		: : "r"(p), "r"(x) : "memory");
+}
+
+static inline void __sh_and_llsc(volatile int *x, int v)
+{
+	__asm__ __volatile__(
+		LLSC_START("%0")
+		"	and %1, r0\n"
+		LLSC_END("%0")
+		: : "r"(x), "r"(v) : LLSC_CLOBBERS);
+}
+
+static inline void __sh_or_llsc(volatile int *x, int v)
+{
+	__asm__ __volatile__(
+		LLSC_START("%0")
+		"	or %1, r0\n"
+		LLSC_END("%0")
+		: : "r"(x), "r"(v) : LLSC_CLOBBERS);
+}
+
+#ifdef __SH4A__
+#define a_cas(p,t,s)     __sh_cas_llsc(p,t,s)
+#define a_swap(x,v)      __sh_swap_llsc(x,v)
+#define a_fetch_add(x,v) __sh_fetch_add_llsc(x, v)
+#define a_store(x,v)     __sh_store_llsc(x, v)
+#define a_and(x,v)       __sh_and_llsc(x, v)
+#define a_or(x,v)        __sh_or_llsc(x, v)
+#else
+
 int  __sh_cas(volatile int *, int, int);
 int  __sh_swap(volatile int *, int);
 int  __sh_fetch_add(volatile int *, int);
@@ -35,6 +117,7 @@ void __sh_or(volatile int *, int);
 #define a_store(x,v)     __sh_store(x, v)
 #define a_and(x,v)       __sh_and(x, v)
 #define a_or(x,v)        __sh_or(x, v)
+#endif
 
 static inline void *a_cas_p(volatile void *p, void *t, void *s)
 {
--- a/arch/sh/crt_arch.h
+++ b/arch/sh/crt_arch.h
@@ -1,4 +1,5 @@
 __asm__(
+".text \n"
 ".global " START " \n"
 START ": \n"
 "	mova 1f, r0 \n"
--- a/arch/sh/src/atomic.c
+++ b/arch/sh/src/atomic.c
@@ -1,12 +1,7 @@
-#include "libc.h"
+#ifndef __SH4A__
 
-#define LLSC_CLOBBERS   "r0", "t", "memory"
-#define LLSC_START(mem) "synco\n"  \
-	"0:	movli.l @" mem ", r0\n"
-#define LLSC_END(mem)              \
-	"1:	movco.l r0, @" mem "\n"    \
-	"	bf 0b\n"                   \
-	"	synco\n"
+#include "atomic.h"
+#include "libc.h"
 
 /* gusa is a hack in the kernel which lets you create a sequence of instructions
  * which will be restarted if the process is preempted in the middle of the
@@ -34,114 +29,74 @@
 
 int __sh_cas(volatile int *p, int t, int s)
 {
+	if (__hwcap & CPU_HAS_LLSC) return __sh_cas_llsc(p, t, s);
+
 	int old;
-	if (__hwcap & CPU_HAS_LLSC) {
-		__asm__ __volatile__(
-			LLSC_START("%1")
-			"	mov r0, %0\n"
-			"	cmp/eq %0, %2\n"
-			"	bf 1f\n"
-			"	mov %3, r0\n"
-			LLSC_END("%1")
-			: "=&r"(old) : "r"(p), "r"(t), "r"(s) : LLSC_CLOBBERS);
-	} else {
-		__asm__ __volatile__(
-			GUSA_START_EVEN("%1", "%0")
-			"	cmp/eq %0, %2\n"
-			"	bf 1f\n"
-			GUSA_END("%1", "%3")
-			: "=&r"(old) : "r"(p), "r"(t), "r"(s) : GUSA_CLOBBERS, "t");
-	}
+	__asm__ __volatile__(
+		GUSA_START_EVEN("%1", "%0")
+		"	cmp/eq %0, %2\n"
+		"	bf 1f\n"
+		GUSA_END("%1", "%3")
+		: "=&r"(old) : "r"(p), "r"(t), "r"(s) : GUSA_CLOBBERS, "t");
 	return old;
 }
 
 int __sh_swap(volatile int *x, int v)
 {
+	if (__hwcap & CPU_HAS_LLSC) return __sh_swap_llsc(x, v);
+
 	int old;
-	if (__hwcap & CPU_HAS_LLSC) {
-		__asm__ __volatile__(
-			LLSC_START("%1")
-			"	mov r0, %0\n"
-			"	mov %2, r0\n"
-			LLSC_END("%1")
-			: "=&r"(old) : "r"(x), "r"(v) : LLSC_CLOBBERS);
-	} else {
-		__asm__ __volatile__(
-			GUSA_START_EVEN("%1", "%0")
-			GUSA_END("%1", "%2")
-			: "=&r"(old) : "r"(x), "r"(v) : GUSA_CLOBBERS);
-	}
+	__asm__ __volatile__(
+		GUSA_START_EVEN("%1", "%0")
+		GUSA_END("%1", "%2")
+		: "=&r"(old) : "r"(x), "r"(v) : GUSA_CLOBBERS);
 	return old;
 }
 
 int __sh_fetch_add(volatile int *x, int v)
 {
+	if (__hwcap & CPU_HAS_LLSC) return __sh_fetch_add_llsc(x, v);
+
 	int old, dummy;
-	if (__hwcap & CPU_HAS_LLSC) {
-		__asm__ __volatile__(
-			LLSC_START("%1")
-			"	mov r0, %0\n"
-			"	add %2, r0\n"
-			LLSC_END("%1")
-			: "=&r"(old) : "r"(x), "r"(v) : LLSC_CLOBBERS);
-	} else {
-		__asm__ __volatile__(
-			GUSA_START_EVEN("%2", "%0")
-			"	mov %0, %1\n"
-			"	add %3, %1\n"
-			GUSA_END("%2", "%1")
-			: "=&r"(old), "=&r"(dummy) : "r"(x), "r"(v) : GUSA_CLOBBERS);
-	}
+	__asm__ __volatile__(
+		GUSA_START_EVEN("%2", "%0")
+		"	mov %0, %1\n"
+		"	add %3, %1\n"
+		GUSA_END("%2", "%1")
+		: "=&r"(old), "=&r"(dummy) : "r"(x), "r"(v) : GUSA_CLOBBERS);
 	return old;
 }
 
 void __sh_store(volatile int *p, int x)
 {
-	if (__hwcap & CPU_HAS_LLSC) {
-		__asm__ __volatile__(
-			"	synco\n"
-			"	mov.l %1, @%0\n"
-			"	synco\n"
-			: : "r"(p), "r"(x) : "memory");
-	} else {
-		__asm__ __volatile__(
-			"	mov.l %1, @%0\n"
-			: : "r"(p), "r"(x) : "memory");
-	}
+	if (__hwcap & CPU_HAS_LLSC) return __sh_store_llsc(p, x);
+	__asm__ __volatile__(
+		"	mov.l %1, @%0\n"
+		: : "r"(p), "r"(x) : "memory");
 }
 
 void __sh_and(volatile int *x, int v)
 {
+	if (__hwcap & CPU_HAS_LLSC) return __sh_and_llsc(x, v);
+
 	int dummy;
-	if (__hwcap & CPU_HAS_LLSC) {
-		__asm__ __volatile__(
-			LLSC_START("%0")
-			"	and %1, r0\n"
-			LLSC_END("%0")
-			: : "r"(x), "r"(v) : LLSC_CLOBBERS);
-	} else {
-		__asm__ __volatile__(
-			GUSA_START_ODD("%1", "%0")
-			"	and %2, %0\n"
-			GUSA_END("%1", "%0")
-			: "=&r"(dummy) : "r"(x), "r"(v) : GUSA_CLOBBERS);
-	}
+	__asm__ __volatile__(
+		GUSA_START_ODD("%1", "%0")
+		"	and %2, %0\n"
+		GUSA_END("%1", "%0")
+		: "=&r"(dummy) : "r"(x), "r"(v) : GUSA_CLOBBERS);
 }
 
 void __sh_or(volatile int *x, int v)
 {
+	if (__hwcap & CPU_HAS_LLSC) return __sh_or_llsc(x, v);
+
 	int dummy;
-	if (__hwcap & CPU_HAS_LLSC) {
-		__asm__ __volatile__(
-			LLSC_START("%0")
-			"	or %1, r0\n"
-			LLSC_END("%0")
-			: : "r"(x), "r"(v) : LLSC_CLOBBERS);
-	} else {
-		__asm__ __volatile__(
-			GUSA_START_ODD("%1", "%0")
-			"	or %2, %0\n"
-			GUSA_END("%1", "%0")
-			: "=&r"(dummy) : "r"(x), "r"(v) : GUSA_CLOBBERS);
-	}
+	__asm__ __volatile__(
+		GUSA_START_ODD("%1", "%0")
+		"	or %2, %0\n"
+		GUSA_END("%1", "%0")
+		: "=&r"(dummy) : "r"(x), "r"(v) : GUSA_CLOBBERS);
 }
+
+#endif
--- a/arch/x32/atomic.h
+++ b/arch/x32/atomic.h
@@ -47,16 +47,16 @@ static inline int a_cas(volatile int *p,
 	return t;
 }
 
-static inline void a_or(volatile void *p, int v)
+static inline void a_or(volatile int *p, int v)
 {
 	__asm__( "lock ; or %1, %0"
-		: "=m"(*(int *)p) : "r"(v) : "memory" );
+		: "=m"(*p) : "r"(v) : "memory" );
 }
 
-static inline void a_and(volatile void *p, int v)
+static inline void a_and(volatile int *p, int v)
 {
 	__asm__( "lock ; and %1, %0"
-		: "=m"(*(int *)p) : "r"(v) : "memory" );
+		: "=m"(*p) : "r"(v) : "memory" );
 }
 
 static inline int a_swap(volatile int *x, int v)
--- a/arch/x86_64/atomic.h
+++ b/arch/x86_64/atomic.h
@@ -47,16 +47,16 @@ static inline int a_cas(volatile int *p,
 	return t;
 }
 
-static inline void a_or(volatile void *p, int v)
+static inline void a_or(volatile int *p, int v)
 {
 	__asm__( "lock ; or %1, %0"
-		: "=m"(*(int *)p) : "r"(v) : "memory" );
+		: "=m"(*p) : "r"(v) : "memory" );
 }
 
-static inline void a_and(volatile void *p, int v)
+static inline void a_and(volatile int *p, int v)
 {
 	__asm__( "lock ; and %1, %0"
-		: "=m"(*(int *)p) : "r"(v) : "memory" );
+		: "=m"(*p) : "r"(v) : "memory" );
 }
 
 static inline int a_swap(volatile int *x, int v)
--- a/configure
+++ b/configure
@@ -80,7 +80,7 @@ fi
 tryflag () {
 printf "checking whether compiler accepts %s... " "$2"
 echo "typedef int x;" > "$tmpc"
-if $CC $2 -c -o /dev/null "$tmpc" >/dev/null 2>&1 ; then
+if $CC $CFLAGS_TRY $2 -c -o /dev/null "$tmpc" >/dev/null 2>&1 ; then
 printf "yes\n"
 eval "$1=\"\${$1} \$2\""
 eval "$1=\${$1# }"
@@ -94,7 +94,7 @@ fi
 tryldflag () {
 printf "checking whether linker accepts %s... " "$2"
 echo "typedef int x;" > "$tmpc"
-if $CC -nostdlib -shared "$2" -o /dev/null "$tmpc" >/dev/null 2>&1 ; then
+if $CC $LDFLAGS_TRY -nostdlib -shared "$2" -o /dev/null "$tmpc" >/dev/null 2>&1 ; then
 printf "yes\n"
 eval "$1=\"\${$1} \$2\""
 eval "$1=\${$1# }"
@@ -113,7 +113,9 @@ CFLAGS_C99FSE=
 CFLAGS_AUTO=
 CFLAGS_MEMOPS=
 CFLAGS_NOSSP=
+CFLAGS_TRY=
 LDFLAGS_AUTO=
+LDFLAGS_TRY=
 OPTIMIZE_GLOBS=
 prefix=/usr/local/musl
 exec_prefix='$(prefix)'
@@ -205,6 +207,14 @@ exit 1
 fi
 
 #
+# Figure out options to force errors on unknown flags.
+#
+tryflag   CFLAGS_TRY  -Werror=unknown-warning-option
+tryflag   CFLAGS_TRY  -Werror=unused-command-line-argument
+tryldflag LDFLAGS_TRY -Werror=unknown-warning-option
+tryldflag LDFLAGS_TRY -Werror=unused-command-line-argument
+
+#
 # Need to know if the compiler is gcc to decide whether to build the
 # musl-gcc wrapper, and for critical bug detection in some gcc versions.
 #
--- a/crt/mips/crt1.s
+++ b/crt/mips/crt1.s
@@ -4,6 +4,8 @@
 .weak  _fini
 .global __start
 .global _start
+.type __start,@function
+.type _start,@function
 __start:
 _start:
 	subu    $fp, $fp, $fp            # Zero the frame pointer.
--- a/crt/mips/crti.s
+++ b/crt/mips/crti.s
@@ -2,6 +2,7 @@
 
 .section .init
 .global _init
+.type _init,@function
 .align 2
 _init:
 	subu $sp,$sp,32
@@ -10,6 +11,7 @@ _init:
 
 .section .fini
 .global _fini
+.type _fini,@function
 .align 2
 _fini:
 	subu $sp,$sp,32
--- /dev/null
+++ b/crt/rcrt1.c
@@ -0,0 +1,15 @@
+#define SHARED
+#define START "_start"
+#define _dlstart_c _start_c
+#include "../src/ldso/dlstart.c"
+
+int main();
+void _init() __attribute__((weak));
+void _fini() __attribute__((weak));
+_Noreturn int __libc_start_main(int (*)(), int, char **,
+	void (*)(), void(*)(), void(*)());
+
+_Noreturn void __dls2(unsigned char *base, size_t *sp)
+{
+	__libc_start_main(main, *sp, (void *)(sp+1), _init, _fini, 0);
+}
--- a/include/sys/resource.h
+++ b/include/sys/resource.h
@@ -96,6 +96,9 @@ int prlimit(pid_t, int, const struct rli
 #define RLIM_NLIMITS RLIMIT_NLIMITS
 
 #if defined(_LARGEFILE64_SOURCE) || defined(_GNU_SOURCE)
+#define RLIM64_INFINITY RLIM_INFINITY
+#define RLIM64_SAVED_CUR RLIM_SAVED_CUR
+#define RLIM64_SAVED_MAX RLIM_SAVED_MAX
 #define getrlimit64 getrlimit
 #define setrlimit64 setrlimit
 #define rlimit64 rlimit
--- a/src/internal/dynlink.h
+++ b/src/internal/dynlink.h
@@ -51,7 +51,7 @@ enum {
 #define AUX_CNT 32
 #define DYN_CNT 32
 
-typedef void (*stage2_func)(unsigned char *);
+typedef void (*stage2_func)(unsigned char *, size_t *);
 typedef _Noreturn void (*stage3_func)(size_t *);
 
 #endif
--- a/src/internal/libc.h
+++ b/src/internal/libc.h
@@ -8,9 +8,7 @@
 struct __locale_map;
 
 struct __locale_struct {
-	volatile int ctype_utf8;
-	char *messages_name;
-	struct __locale_map *volatile cat[4];
+	const struct __locale_map *volatile cat[6];
 };
 
 struct __libc {
@@ -23,8 +21,6 @@ struct __libc {
 	volatile int ofl_lock[2];
 	size_t tls_size;
 	size_t page_size;
-	volatile int uselocale_cnt;
-	volatile int bytelocale_cnt_minus_1;
 	struct __locale_struct global_locale;
 };
 
--- a/src/internal/locale_impl.h
+++ b/src/internal/locale_impl.h
@@ -9,22 +9,20 @@ struct __locale_map {
 	const void *map;
 	size_t map_size;
 	char name[LOCALE_NAME_MAX+1];
-	struct __locale_map *next;
+	const struct __locale_map *next;
 };
 
-int __setlocalecat(locale_t, int, const char *);
+const struct __locale_map *__get_locale(int, const char *);
 const char *__mo_lookup(const void *, size_t, const char *);
 const char *__lctrans(const char *, const struct __locale_map *);
 const char *__lctrans_cur(const char *);
 
-#define LCTRANS(msg, lc, loc) __lctrans(msg, (loc)->cat[(lc)-2])
+#define LCTRANS(msg, lc, loc) __lctrans(msg, (loc)->cat[(lc)])
 #define LCTRANS_CUR(msg) __lctrans_cur(msg)
 
-#define CURRENT_LOCALE \
-	(libc.uselocale_cnt ? __pthread_self()->locale : &libc.global_locale)
+#define CURRENT_LOCALE (__pthread_self()->locale)
 
-#define CURRENT_UTF8 \
-	(libc.bytelocale_cnt_minus_1<0 || __pthread_self()->locale->ctype_utf8)
+#define CURRENT_UTF8 (!!__pthread_self()->locale->cat[LC_CTYPE])
 
 #undef MB_CUR_MAX
 #define MB_CUR_MAX (CURRENT_UTF8 ? 4 : 1)
--- a/src/ldso/dlstart.c
+++ b/src/ldso/dlstart.c
@@ -56,31 +56,22 @@ void _dlstart_c(size_t *sp, size_t *dynv
 		for (i=0; i<local_cnt; i++) got[i] += (size_t)base;
 	}
 
-	/* The use of the reloc_info structure and nested loops is a trick
-	 * to work around the fact that we can't necessarily make function
-	 * calls yet. Each struct in the array serves like the arguments
-	 * to a function call. */
-	struct {
-		void *rel;
-		size_t size;
-		size_t stride;
-	} reloc_info[] = {
-		{ base+dyn[DT_JMPREL], dyn[DT_PLTRELSZ], 2+(dyn[DT_PLTREL]==DT_RELA) },
-		{ base+dyn[DT_REL], dyn[DT_RELSZ], 2 },
-		{ base+dyn[DT_RELA], dyn[DT_RELASZ], 3 },
-		{ 0, 0, 0 }
-	};
-
-	for (i=0; reloc_info[i].stride; i++) {
-		size_t *rel = reloc_info[i].rel;
-		size_t rel_size = reloc_info[i].size;
-		size_t stride = reloc_info[i].stride;
-		for (; rel_size; rel+=stride, rel_size-=stride*sizeof(size_t)) {
-			if (!IS_RELATIVE(rel[1])) continue;
-			size_t *rel_addr = (void *)(base + rel[0]);
-			size_t addend = stride==3 ? rel[2] : *rel_addr;
-			*rel_addr = (size_t)base + addend;
-		}
+	size_t *rel, rel_size;
+
+	rel = (void *)(base+dyn[DT_REL]);
+	rel_size = dyn[DT_RELSZ];
+	for (; rel_size; rel+=2, rel_size-=2*sizeof(size_t)) {
+		if (!IS_RELATIVE(rel[1])) continue;
+		size_t *rel_addr = (void *)(base + rel[0]);
+		*rel_addr += (size_t)base;
+	}
+
+	rel = (void *)(base+dyn[DT_RELA]);
+	rel_size = dyn[DT_RELASZ];
+	for (; rel_size; rel+=3, rel_size-=3*sizeof(size_t)) {
+		if (!IS_RELATIVE(rel[1])) continue;
+		size_t *rel_addr = (void *)(base + rel[0]);
+		*rel_addr = (size_t)base + rel[2];
 	}
 
 	const char *strings = (void *)(base + dyn[DT_STRTAB]);
@@ -93,16 +84,7 @@ void _dlstart_c(size_t *sp, size_t *dynv
 		 && s[3]=='l' && s[4]=='s' && s[5]=='2' && !s[6])
 			break;
 	}
-	((stage2_func)(base + syms[i].st_value))(base);
-
-	/* Call dynamic linker stage-3, __dls3 */
-	for (i=0; ;i++) {
-		const char *s = strings + syms[i].st_name;
-		if (s[0]=='_' && s[1]=='_' && s[2]=='d'
-		 && s[3]=='l' && s[4]=='s' && s[5]=='3' && !s[6])
-			break;
-	}
-	((stage3_func)(base + syms[i].st_value))(sp);
+	((stage2_func)(base + syms[i].st_value))(base, sp);
 }
 
 #endif
--- a/src/ldso/dynlink.c
+++ b/src/ldso/dynlink.c
@@ -74,7 +74,6 @@ struct dso {
 	volatile int new_dtv_idx, new_tls_idx;
 	struct td_index *td_index;
 	struct dso *fini_next;
-	int rel_early_relative, rel_update_got;
 	char *shortname;
 	char buf[];
 };
@@ -96,6 +95,9 @@ static struct builtin_tls {
 } builtin_tls[1];
 #define MIN_TLS_ALIGN offsetof(struct builtin_tls, pt)
 
+#define ADDEND_LIMIT 4096
+static size_t *saved_addends, *apply_addends_to;
+
 static struct dso ldso;
 static struct dso *head, *tail, *fini_head;
 static char *env_path, *sys_path;
@@ -256,10 +258,19 @@ static void do_relocs(struct dso *dso, s
 	size_t sym_val;
 	size_t tls_val;
 	size_t addend;
+	int skip_relative = 0, reuse_addends = 0, save_slot = 0;
+
+	if (dso == &ldso) {
+		/* Only ldso's REL table needs addend saving/reuse. */
+		if (rel == apply_addends_to)
+			reuse_addends = 1;
+		skip_relative = 1;
+	}
 
 	for (; rel_size; rel+=stride, rel_size-=stride*sizeof(size_t)) {
-		if (dso->rel_early_relative && IS_RELATIVE(rel[1])) continue;
+		if (skip_relative && IS_RELATIVE(rel[1])) continue;
 		type = R_TYPE(rel[1]);
+		if (type == REL_NONE) continue;
 		sym_index = R_SYM(rel[1]);
 		reloc_addr = (void *)(base + rel[0]);
 		if (sym_index) {
@@ -280,12 +291,20 @@ static void do_relocs(struct dso *dso, s
 			def.dso = dso;
 		}
 
-		int gotplt = (type == REL_GOT || type == REL_PLT);
-		if (dso->rel_update_got && !gotplt) continue;
-
-		addend = stride>2 ? rel[2]
-			: gotplt || type==REL_COPY ? 0
-			: *reloc_addr;
+		if (stride > 2) {
+			addend = rel[2];
+		} else if (type==REL_GOT || type==REL_PLT|| type==REL_COPY) {
+			addend = 0;
+		} else if (reuse_addends) {
+			/* Save original addend in stage 2 where the dso
+			 * chain consists of just ldso; otherwise read back
+			 * saved addend since the inline one was clobbered. */
+			if (head==&ldso)
+				saved_addends[save_slot] = *reloc_addr;
+			addend = saved_addends[save_slot++];
+		} else {
+			addend = *reloc_addr;
+		}
 
 		sym_val = def.sym ? (size_t)def.dso->base+def.sym->st_value : 0;
 		tls_val = def.sym ? def.sym->st_value : 0;
@@ -879,7 +898,7 @@ static void do_mips_relocs(struct dso *p
 	size_t i, j, rel[2];
 	unsigned char *base = p->base;
 	i=0; search_vec(p->dynv, &i, DT_MIPS_LOCAL_GOTNO);
-	if (p->rel_early_relative) {
+	if (p==&ldso) {
 		got += i;
 	} else {
 		while (i--) *got++ += (size_t)base;
@@ -1116,7 +1135,7 @@ static void update_tls_size()
  * linker itself, but some of the relocations performed may need to be
  * replaced later due to copy relocations in the main program. */
 
-void __dls2(unsigned char *base)
+void __dls2(unsigned char *base, size_t *sp)
 {
 	Ehdr *ehdr = (void *)base;
 	ldso.base = base;
@@ -1125,15 +1144,35 @@ void __dls2(unsigned char *base)
 	ldso.phnum = ehdr->e_phnum;
 	ldso.phdr = (void *)(base + ehdr->e_phoff);
 	ldso.phentsize = ehdr->e_phentsize;
-	ldso.rel_early_relative = 1;
 	kernel_mapped_dso(&ldso);
 	decode_dyn(&ldso);
 
+	/* Prepare storage to save clobbered REL addends so they
+	 * can be reused in stage 3. There should be very few. If
+	 * something goes wrong and there are a huge number, abort
+	 * instead of risking stack overflow. */
+	size_t dyn[DYN_CNT];
+	decode_vec(ldso.dynv, dyn, DYN_CNT);
+	size_t *rel = (void *)(base+dyn[DT_REL]);
+	size_t rel_size = dyn[DT_RELSZ];
+	size_t symbolic_rel_cnt = 0;
+	apply_addends_to = rel;
+	for (; rel_size; rel+=2, rel_size-=2*sizeof(size_t))
+		if (!IS_RELATIVE(rel[1])) symbolic_rel_cnt++;
+	if (symbolic_rel_cnt >= ADDEND_LIMIT) a_crash();
+	size_t addends[symbolic_rel_cnt+1];
+	saved_addends = addends;
+
 	head = &ldso;
 	reloc_all(&ldso);
 
 	ldso.relocated = 0;
-	ldso.rel_update_got = 1;
+
+	/* Call dynamic linker stage-3, __dls3, looking it up
+	 * symbolically as a barrier against moving the address
+	 * load across the above relocation processing. */
+	struct symdef dls3_def = find_sym(&ldso, "__dls3", 0);
+	((stage3_func)(ldso.base+dls3_def.sym->st_value))(sp);
 }
 
 /* Stage 3 of the dynamic linker is called with the dynamic linker/libc
--- a/src/locale/__lctrans.c
+++ b/src/locale/__lctrans.c
@@ -16,5 +16,5 @@ const char *__lctrans(const char *msg, c
 
 const char *__lctrans_cur(const char *msg)
 {
-	return __lctrans_impl(msg, CURRENT_LOCALE->cat[LC_MESSAGES-2]);
+	return __lctrans_impl(msg, CURRENT_LOCALE->cat[LC_MESSAGES]);
 }
--- a/src/locale/__setlocalecat.c
+++ /dev/null
@@ -1,111 +0,0 @@
-#include <locale.h>
-#include <string.h>
-#include "locale_impl.h"
-#include "libc.h"
-#include "atomic.h"
-
-const char *__lctrans_impl(const char *msg, const struct __locale_map *lm)
-{
-	const char *trans = 0;
-	if (lm) trans = __mo_lookup(lm->map, lm->map_size, msg);
-	return trans ? trans : msg;
-}
-
-const unsigned char *__map_file(const char *, size_t *);
-int __munmap(void *, size_t);
-char *__strchrnul(const char *, int);
-
-static struct __locale_map *findlocale(const char *name, size_t n)
-{
-	static void *volatile loc_head;
-	struct __locale_map *p, *new, *old_head;
-	const char *path = 0, *z;
-	char buf[256];
-	size_t l;
-	const void *map;
-	size_t map_size;
-
-	for (p=loc_head; p; p=p->next)
-		if (!strcmp(name, p->name)) return p;
-
-	if (!libc.secure) path = getenv("MUSL_LOCPATH");
-	/* FIXME: add a default path? */
-	if (!path) return 0;
-
-	for (; *path; path=z+!!*z) {
-		z = __strchrnul(path, ':');
-		l = z - path - !!*z;
-		if (l >= sizeof buf - n - 2) continue;
-		memcpy(buf, path, l);
-		buf[l] = '/';
-		memcpy(buf+l+1, name, n);
-		buf[l+1+n] = 0;
-		map = __map_file(buf, &map_size);
-		if (map) {
-			new = malloc(sizeof *new);
-			if (!new) {
-				__munmap((void *)map, map_size);
-				return 0;
-			}
-			new->map = map;
-			new->map_size = map_size;
-			memcpy(new->name, name, n);
-			new->name[n] = 0;
-			do {
-				old_head = loc_head;
-				new->next = old_head;
-			} while (a_cas_p(&loc_head, old_head, new) != old_head);
-			return new;
-		}
-	}
-	return 0;
-}
-
-static const char envvars[][12] = {
-	"LC_CTYPE",
-	"LC_NUMERIC",
-	"LC_TIME",
-	"LC_COLLATE",
-	"LC_MONETARY",
-	"LC_MESSAGES",
-};
-
-int __setlocalecat(locale_t loc, int cat, const char *val)
-{
-	if (!*val) {
-		(val = getenv("LC_ALL")) && *val ||
-		(val = getenv(envvars[cat])) && *val ||
-		(val = getenv("LANG")) && *val ||
-		(val = "C.UTF-8");
-	}
-
-	size_t n;
-	for (n=0; n<LOCALE_NAME_MAX && val[n] && val[n]!='/'; n++);
-	if (val[0]=='.' || val[n]) val = "C.UTF-8";
-	int builtin = (val[0]=='C' && !val[1])
-		|| !strcmp(val, "C.UTF-8")
-		|| !strcmp(val, "POSIX");
-	struct __locale_map *data, *old;
-
-	switch (cat) {
-	case LC_CTYPE:
-		a_store(&loc->ctype_utf8, !builtin || val[1]=='.');
-		break;
-	case LC_MESSAGES:
-		if (builtin) {
-			loc->messages_name[0] = 0;
-		} else {
-			memcpy(loc->messages_name, val, n);
-			loc->messages_name[n] = 0;
-		}
-		/* fall through */
-	default:
-		data = builtin ? 0 : findlocale(val, n);
-		if (data == loc->cat[cat-2]) break;
-		do old = loc->cat[cat-2];
-		while (a_cas_p(&loc->cat[cat-2], old, data) != old);
-	case LC_NUMERIC:
-		break;
-	}
-	return 0;
-}
--- a/src/locale/dcngettext.c
+++ b/src/locale/dcngettext.c
@@ -84,13 +84,15 @@ char *bindtextdomain(const char *domainn
 }
 
 static const char catnames[][12] = {
+	"LC_CTYPE",
+	"LC_NUMERIC",
 	"LC_TIME",
 	"LC_COLLATE",
 	"LC_MONETARY",
 	"LC_MESSAGES",
 };
 
-static const char catlens[] = { 7, 10, 11, 11 };
+static const char catlens[] = { 8, 10, 7, 10, 11, 11 };
 
 struct msgcat {
 	struct msgcat *next;
@@ -117,10 +119,12 @@ char *dcngettext(const char *domainname,
 	static struct msgcat *volatile cats;
 	struct msgcat *p;
 	struct __locale_struct *loc = CURRENT_LOCALE;
-	struct __locale_map *lm;
+	const struct __locale_map *lm;
 	const char *dirname, *locname, *catname;
 	size_t dirlen, loclen, catlen, domlen;
 
+	if ((unsigned)category >= LC_ALL) goto notrans;
+
 	if (!domainname) domainname = __gettextdomain();
 
 	domlen = strlen(domainname);
@@ -129,25 +133,15 @@ char *dcngettext(const char *domainname,
 	dirname = gettextdir(domainname, &dirlen);
 	if (!dirname) goto notrans;
 
-	switch (category) {
-	case LC_MESSAGES:
-		locname = loc->messages_name;
-		if (!*locname) goto notrans;
-		break;
-	case LC_TIME:
-	case LC_MONETARY:
-	case LC_COLLATE:
-		lm = loc->cat[category-2];
-		if (!lm) goto notrans;
-		locname = lm->name;
-		break;
-	default:
+	lm = loc->cat[category];
+	if (!lm) {
 notrans:
 		return (char *) ((n == 1) ? msgid1 : msgid2);
 	}
+	locname = lm->name;
 
-	catname = catnames[category-2];
-	catlen = catlens[category-2];
+	catname = catnames[category];
+	catlen = catlens[category];
 	loclen = strlen(locname);
 
 	size_t namelen = dirlen+1 + loclen+1 + catlen+1 + domlen+3;
--- a/src/locale/duplocale.c
+++ b/src/locale/duplocale.c
@@ -5,17 +5,10 @@
 
 locale_t __duplocale(locale_t old)
 {
-	locale_t new = calloc(1, sizeof *new + LOCALE_NAME_MAX + 1);
+	locale_t new = malloc(sizeof *new);
 	if (!new) return 0;
-	new->messages_name = (void *)(new+1);
-
 	if (old == LC_GLOBAL_LOCALE) old = &libc.global_locale;
-	new->ctype_utf8 = old->ctype_utf8;
-	if (old->messages_name)
-		strcpy(new->messages_name, old->messages_name);
-
-	for (size_t i=0; i<sizeof new->cat/sizeof new->cat[0]; i++)
-		new->cat[i] = old->cat[i];
+	*new = *old;
 	return new;
 }
 
--- a/src/locale/freelocale.c
+++ b/src/locale/freelocale.c
@@ -2,9 +2,11 @@
 #include "locale_impl.h"
 #include "libc.h"
 
+int __loc_is_allocated(locale_t);
+
 void freelocale(locale_t l)
 {
-	free(l);
+	if (__loc_is_allocated(l)) free(l);
 }
 
 weak_alias(freelocale, __freelocale);
--- a/src/locale/iconv.c
+++ b/src/locale/iconv.c
@@ -23,19 +23,13 @@
 #define BIG5        0340
 #define EUC_KR      0350
 
-/* FIXME: these are not implemented yet
- * EUC:   A1-FE A1-FE
- * GBK:   81-FE 40-7E,80-FE
- * Big5:  A1-FE 40-7E,A1-FE
- */
-
 /* Definitions of charmaps. Each charmap consists of:
  * 1. Empty-string-terminated list of null-terminated aliases.
  * 2. Special type code or number of elided entries.
  * 3. Character table (size determined by field 2). */
 
 static const unsigned char charmaps[] =
-"utf8\0\0\310"
+"utf8\0char\0\0\310"
 "wchart\0\0\306"
 "ucs2\0ucs2be\0\0\304"
 "ucs2le\0\0\305"
@@ -90,6 +84,7 @@ static int fuzzycmp(const unsigned char
 static size_t find_charmap(const void *name)
 {
 	const unsigned char *s;
+	if (!*(char *)name) name=charmaps; /* "utf8" */
 	for (s=charmaps; *s; ) {
 		if (!fuzzycmp(name, s)) {
 			for (; *s; s+=strlen((void *)s)+1);
--- /dev/null
+++ b/src/locale/locale_map.c
@@ -0,0 +1,124 @@
+#include <locale.h>
+#include <string.h>
+#include "locale_impl.h"
+#include "libc.h"
+#include "atomic.h"
+
+const char *__lctrans_impl(const char *msg, const struct __locale_map *lm)
+{
+	const char *trans = 0;
+	if (lm) trans = __mo_lookup(lm->map, lm->map_size, msg);
+	return trans ? trans : msg;
+}
+
+const unsigned char *__map_file(const char *, size_t *);
+int __munmap(void *, size_t);
+char *__strchrnul(const char *, int);
+
+static const char envvars[][12] = {
+	"LC_CTYPE",
+	"LC_NUMERIC",
+	"LC_TIME",
+	"LC_COLLATE",
+	"LC_MONETARY",
+	"LC_MESSAGES",
+};
+
+static const uint32_t empty_mo[] = { 0x950412de, 0, -1, -1, -1 };
+
+const struct __locale_map __c_dot_utf8 = {
+	.map = empty_mo,
+	.map_size = sizeof empty_mo,
+	.name = "C.UTF-8"
+};
+
+const struct __locale_map *__get_locale(int cat, const char *val)
+{
+	static int lock[2];
+	static void *volatile loc_head;
+	const struct __locale_map *p;
+	struct __locale_map *new = 0;
+	const char *path = 0, *z;
+	char buf[256];
+	size_t l, n;
+
+	if (!*val) {
+		(val = getenv("LC_ALL")) && *val ||
+		(val = getenv(envvars[cat])) && *val ||
+		(val = getenv("LANG")) && *val ||
+		(val = "C.UTF-8");
+	}
+
+	/* Limit name length and forbid leading dot or any slashes. */
+	for (n=0; n<LOCALE_NAME_MAX && val[n] && val[n]!='/'; n++);
+	if (val[0]=='.' || val[n]) val = "C.UTF-8";
+	int builtin = (val[0]=='C' && !val[1])
+		|| !strcmp(val, "C.UTF-8")
+		|| !strcmp(val, "POSIX");
+
+	if (builtin) {
+		if (cat == LC_CTYPE && val[1]=='.')
+			return (void *)&__c_dot_utf8;
+		return 0;
+	}
+
+	for (p=loc_head; p; p=p->next)
+		if (!strcmp(val, p->name)) return p;
+
+	LOCK(lock);
+
+	for (p=loc_head; p; p=p->next)
+		if (!strcmp(val, p->name)) {
+			UNLOCK(lock);
+			return p;
+		}
+
+	if (!libc.secure) path = getenv("MUSL_LOCPATH");
+	/* FIXME: add a default path? */
+
+	if (path) for (; *path; path=z+!!*z) {
+		z = __strchrnul(path, ':');
+		l = z - path - !!*z;
+		if (l >= sizeof buf - n - 2) continue;
+		memcpy(buf, path, l);
+		buf[l] = '/';
+		memcpy(buf+l+1, val, n);
+		buf[l+1+n] = 0;
+		size_t map_size;
+		const void *map = __map_file(buf, &map_size);
+		if (map) {
+			new = malloc(sizeof *new);
+			if (!new) {
+				__munmap((void *)map, map_size);
+				break;
+			}
+			new->map = map;
+			new->map_size = map_size;
+			memcpy(new->name, val, n);
+			new->name[n] = 0;
+			new->next = loc_head;
+			loc_head = new;
+			break;
+		}
+	}
+
+	/* If no locale definition was found, make a locale map
+	 * object anyway to store the name, which is kept for the
+	 * sake of being able to do message translations at the
+	 * application level. */
+	if (!new && (new = malloc(sizeof *new))) {
+		new->map = empty_mo;
+		new->map_size = sizeof empty_mo;
+		memcpy(new->name, val, n);
+		new->name[n] = 0;
+		new->next = loc_head;
+		loc_head = new;
+	}
+
+	/* For LC_CTYPE, never return a null pointer unless the
+	 * requested name was "C" or "POSIX". */
+	if (!new && cat == LC_CTYPE) new = (void *)&__c_dot_utf8;
+
+	UNLOCK(lock);
+	return new;
+}
--- a/src/locale/newlocale.c
+++ b/src/locale/newlocale.c
@@ -3,22 +3,52 @@
 #include "locale_impl.h"
 #include "libc.h"
 
+extern const struct __locale_map __c_dot_utf8;
+
+static const struct __locale_struct c_locale = { 0 };
+static const struct __locale_struct c_dot_utf8_locale = {
+	.cat[LC_CTYPE] = &__c_dot_utf8
+};
+
+int __loc_is_allocated(locale_t loc)
+{
+	return loc && loc != &c_locale && loc != &c_dot_utf8_locale;
+}
+
 locale_t __newlocale(int mask, const char *name, locale_t loc)
 {
-	int i;
+	int i, j;
+	struct __locale_struct tmp;
+	const struct __locale_map *lm;
 
-	if (!loc) {
-		loc = calloc(1, sizeof *loc + LOCALE_NAME_MAX + 1);
-		if (!loc) return 0;
-		loc->messages_name = (void *)(loc+1);
+	/* For locales with allocated storage, modify in-place. */
+	if (__loc_is_allocated(loc)) {
 		for (i=0; i<LC_ALL; i++)
-			if (!(mask & (1<<i)))
-				__setlocalecat(loc, i, "");
+			if (mask & (1<<i))
+				loc->cat[i] = __get_locale(i, name);
+		return loc;
+	}
+
+	/* Otherwise, build a temporary locale object, which will only
+	 * be instantiated in allocated storage if it does not match
+	 * one of the built-in static locales. This makes the common
+	 * usage case for newlocale, getting a C locale with predictable
+	 * behavior, very fast, and more importantly, fail-safe. */
+	for (j=i=0; i<LC_ALL; i++) {
+		if (loc && !(mask & (1<<i)))
+			lm = loc->cat[i];
+		else
+			lm = __get_locale(i, mask & (1<<i) ? name : "");
+		if (lm) j++;
+		tmp.cat[i] = lm;
 	}
 
-	for (i=0; i<LC_ALL; i++)
-		if (mask & (1<<i))
-			__setlocalecat(loc, i, name);
+	if (!j)
+		return (locale_t)&c_locale;
+	if (j==1 && tmp.cat[LC_CTYPE]==c_dot_utf8_locale.cat[LC_CTYPE])
+		return (locale_t)&c_dot_utf8_locale;
+
+	if ((loc = malloc(sizeof *loc))) *loc = tmp;
 
 	return loc;
 }
--- a/src/locale/setlocale.c
+++ b/src/locale/setlocale.c
@@ -5,73 +5,66 @@
 #include "libc.h"
 #include "atomic.h"
 
-static char buf[2+4*(LOCALE_NAME_MAX+1)];
+static char buf[LC_ALL*(LOCALE_NAME_MAX+1)]; /* holds setlocale's LC_ALL result string */
 
-char *setlocale(int cat, const char *name)
+static char *setlocale_one_unlocked(int cat, const char *name)
 {
-	struct __locale_map *lm;
-	int i, j;
+	const struct __locale_map *lm; /* no locking here; setlocale holds its lock around calls */
 
-	if (!libc.global_locale.messages_name) {
-		libc.global_locale.messages_name =
-			buf + 2 + 3*(LOCALE_NAME_MAX+1);
-	}
+	if (name) libc.global_locale.cat[cat] = lm = __get_locale(cat, name);
+	else lm = libc.global_locale.cat[cat];
+	/* Sets (name non-null) or only reads one global category; a null map is reported as "C". */
+	return lm ? (char *)lm->name : "C";
+}
+
+char *__strchrnul(const char *, int);
+/* Set and/or query the global locale. For LC_ALL the names of all categories are joined with ';'. */
+char *setlocale(int cat, const char *name)
+{
+	static volatile int lock[2]; /* LOCKed around every access to the global locale and buf below */
 
 	if ((unsigned)cat > LC_ALL) return 0;
 
+	LOCK(lock);
+
 	/* For LC_ALL, setlocale is required to return a string which
 	 * encodes the current setting for all categories. The format of
 	 * this string is unspecified, and only the following code, which
 	 * performs both the serialization and deserialization, depends
 	 * on the format, so it can easily be changed if needed. */
 	if (cat == LC_ALL) {
+		int i;
 		if (name) {
-			char part[LOCALE_NAME_MAX+1];
-			if (name[0] && name[1]==';'
-			    && strlen(name) > 2 + 3*(LOCALE_NAME_MAX+1)) {
-				part[0] = name[0];
-				part[1] = 0;
-				setlocale(LC_CTYPE, part);
-				part[LOCALE_NAME_MAX] = 0;
-				for (i=LC_TIME; i<LC_MESSAGES; i++) {
-					memcpy(part, name + 2 + (i-2)*(LOCALE_NAME_MAX+1), LOCALE_NAME_MAX);
-					for (j=LOCALE_NAME_MAX-1; j && part[j]==';'; j--)
-						part[j] = 0;
-					setlocale(i, part);
+			char part[LOCALE_NAME_MAX+1] = "C.UTF-8"; /* initial value is used if the first part is too long */
+			const char *p = name;
+			for (i=0; i<LC_ALL; i++) {
+				const char *z = __strchrnul(p, ';');
+				if (z-p <= LOCALE_NAME_MAX) { /* otherwise keep the previous part */
+					memcpy(part, p, z-p);
+					part[z-p] = 0;
+					if (*z) p = z+1; /* at end of name p stays, so the last part repeats */
 				}
-				setlocale(LC_MESSAGES, name + 2 + 3*(LOCALE_NAME_MAX+1));
-			} else {
-				for (i=0; i<LC_ALL; i++)
-					setlocale(i, name);
+				setlocale_one_unlocked(i, part);
 			}
 		}
-		memset(buf, ';', 2 + 3*(LOCALE_NAME_MAX+1));
-		buf[0] = libc.global_locale.ctype_utf8 ? 'U' : 'C';
-		for (i=LC_TIME; i<LC_MESSAGES; i++) {
-			lm = libc.global_locale.cat[i-2];
-			if (lm) memcpy(buf + 2 + (i-2)*(LOCALE_NAME_MAX+1),
-				lm->name, strlen(lm->name));
+		char *s = buf;
+		for (i=0; i<LC_ALL; i++) {
+			const struct __locale_map *lm =
+				libc.global_locale.cat[i];
+			const char *part = lm ? lm->name : "C";
+			size_t l = strlen(part);
+			memcpy(s, part, l);
+			s[l] = ';';
+			s += l+1;
 		}
+		*--s = 0; /* replace the final ';' with the terminator */
+		UNLOCK(lock);
 		return buf;
 	}
 
-	if (name) {
-		int adj = libc.global_locale.ctype_utf8;
-		__setlocalecat(&libc.global_locale, cat, name);
-		adj -= libc.global_locale.ctype_utf8;
-		if (adj) a_fetch_add(&libc.bytelocale_cnt_minus_1, adj);
-	}
+	char *ret = setlocale_one_unlocked(cat, name);
 
-	switch (cat) {
-	case LC_CTYPE:
-		return libc.global_locale.ctype_utf8 ? "C.UTF-8" : "C";
-	case LC_NUMERIC:
-		return "C";
-	case LC_MESSAGES:
-		return libc.global_locale.messages_name[0]
-			? libc.global_locale.messages_name : "C";
-	default:
-		lm = libc.global_locale.cat[cat-2];
-		return lm ? lm->name : "C";
-	}
+	UNLOCK(lock);
+
+	return ret;
 }
--- a/src/locale/uselocale.c
+++ b/src/locale/uselocale.c
@@ -10,15 +10,7 @@ locale_t __uselocale(locale_t new)
 
 	if (new == LC_GLOBAL_LOCALE) new = global;
 
-	if (new && new != old) {
-		int adj = 0;
-		if (new == global) a_dec(&libc.uselocale_cnt);
-		else if (!new->ctype_utf8) adj++;
-		if (old == global) a_inc(&libc.uselocale_cnt);
-		else if (!old->ctype_utf8) adj--;
-		a_fetch_add(&libc.bytelocale_cnt_minus_1, adj);
-		self->locale = new;
-	}
+	if (new) self->locale = new; /* (locale_t)0 only queries; the thread's locale must stay unchanged */
 
 	return old == global ? LC_GLOBAL_LOCALE : old;
 }
--- a/src/stdio/__stdio_read.c
+++ b/src/stdio/__stdio_read.c
@@ -21,7 +21,6 @@ size_t __stdio_read(FILE *f, unsigned ch
 	pthread_cleanup_pop(0);
 	if (cnt <= 0) {
 		f->flags |= F_EOF ^ ((F_ERR^F_EOF) & cnt);
-		f->rpos = f->rend = 0;
 		return cnt;
 	}
 	if (cnt <= iov[0].iov_len) return cnt;
--- a/src/stdio/__toread.c
+++ b/src/stdio/__toread.c
@@ -5,12 +5,12 @@ int __toread(FILE *f)
 	f->mode |= f->mode-1;
 	if (f->wpos > f->buf) f->write(f, 0, 0);
 	f->wpos = f->wbase = f->wend = 0;
-	if (f->flags & (F_EOF|F_NORD)) {
-		if (f->flags & F_NORD) f->flags |= F_ERR;
+	if (f->flags & F_NORD) { /* F_NORD set: mark F_ERR and fail before rpos/rend are set */
+		f->flags |= F_ERR;
 		return EOF;
 	}
-	f->rpos = f->rend = f->buf;
-	return 0;
+	f->rpos = f->rend = f->buf + f->buf_size; /* rpos==rend, both at the end of buf */
+	return (f->flags & F_EOF) ? EOF : 0; /* rpos/rend are set even when EOF is returned */
 }
 
 void __stdio_exit_needed(void);
--- a/src/stdio/__uflow.c
+++ b/src/stdio/__uflow.c
@@ -1,11 +1,11 @@
 #include "stdio_impl.h"
 
-/* This function will never be called if there is already data
- * buffered for reading. Thus we can get by with very few branches. */
+/* Return one byte from f->read, or EOF if __toread fails or f->read does not
+ * return 1. Assumes it is never called with data already buffered for reading. */
 
 int __uflow(FILE *f)
 {
 	unsigned char c;
-	if ((f->rend || !__toread(f)) && f->read(f, &c, 1)==1) return c;
+	if (!__toread(f) && f->read(f, &c, 1)==1) return c; /* __toread is called on every entry */
 	return EOF;
 }
--- a/src/stdio/ungetc.c
+++ b/src/stdio/ungetc.c
@@ -6,7 +6,8 @@ int ungetc(int c, FILE *f)
 
 	FLOCK(f);
 
-	if ((!f->rend && __toread(f)) || f->rpos <= f->buf - UNGET) {
+	if (!f->rpos) __toread(f); /* null rpos: let __toread set up the read pointers */
+	if (!f->rpos || f->rpos <= f->buf - UNGET) { /* still null, or rpos has reached buf - UNGET */
 		FUNLOCK(f);
 		return EOF;
 	}
--- a/src/stdio/ungetwc.c
+++ b/src/stdio/ungetwc.c
@@ -19,7 +19,8 @@ wint_t ungetwc(wint_t c, FILE *f)
 
 	f->mode |= f->mode+1;
 
-	if ((!f->rend && __toread(f)) || f->rpos < f->buf - UNGET + l) {
+	if (!f->rpos) __toread(f); /* null rpos: let __toread set up the read pointers */
+	if (!f->rpos || f->rpos < f->buf - UNGET + l) { /* still null, or fewer than l bytes between buf - UNGET and rpos */
 		FUNLOCK(f);
 		return EOF;
 	}
--- a/src/thread/i386/__set_thread_area.s
+++ b/src/thread/i386/__set_thread_area.s
@@ -6,10 +6,10 @@ __set_thread_area:
 	push $0x51
 	push $0xfffff
 	push 16(%esp)
-	xor %edx,%edx
-	mov %gs,%dx
-	sub $3,%edx
-	sar $3,%edx
+	call 1f                 # push the address of label 1
+1:	addl $4f-1b,(%esp)      # adjust it to the address of the word at label 4
+	pop %ecx                # ecx = address of that word
+	mov (%ecx),%edx         # edx = its current value (initially -1)
 	push %edx
 	mov %esp,%ebx
 	xor %eax,%eax
@@ -18,6 +18,7 @@ __set_thread_area:
 	testl %eax,%eax
 	jnz 2f
 	movl (%esp),%edx
+	movl %edx,(%ecx)        # save the value now at the top of the stack into the word at label 4
 	leal 3(,%edx,8),%edx
 3:	movw %dx,%gs
 1:
@@ -38,3 +39,7 @@ __set_thread_area:
 	mov $7,%dl
 	inc %al
 	jmp 3b
+
+.data
+	.align 4
+4:	.long -1                # word read and updated by the code above
--- a/src/thread/mips/syscall_cp.s
+++ b/src/thread/mips/syscall_cp.s
@@ -2,10 +2,13 @@
 
 .global __cp_begin
 .hidden __cp_begin
+.type   __cp_begin,@function
 .global __cp_end
 .hidden __cp_end
+.type   __cp_end,@function
 .global __cp_cancel
 .hidden __cp_cancel
+.type   __cp_cancel,@function
 .hidden __cancel
 .global __syscall_cp_asm
 .hidden __syscall_cp_asm
--- a/src/thread/pthread_create.c
+++ b/src/thread/pthread_create.c
@@ -67,12 +67,6 @@ _Noreturn void __pthread_exit(void *resu
 		exit(0);
 	}
 
-	if (self->locale != &libc.global_locale) {
-		a_dec(&libc.uselocale_cnt);
-		if (self->locale->ctype_utf8)
-			a_dec(&libc.bytelocale_cnt_minus_1);
-	}
-
 	/* Process robust list in userspace to handle non-pshared mutexes
 	 * and the detached thread case where the robust list head will
 	 * be invalid when the kernel would process it. */