diff options
author | Sergei Trofimovich <slyfox@gentoo.org> | 2017-01-16 09:45:51 +0000 |
---|---|---|
committer | Sergei Trofimovich <slyfox@gentoo.org> | 2017-01-16 09:50:00 +0000 |
commit | ef416f3d1295dfc8fb4a0638a3c5f0cab4f9bab2 (patch) | |
tree | 3d1badf2e0dca0f46173cb7bf791fe8904100757 /dev-lang/ghc/files | |
parent | media-video/subliminal: amd64 stable wrt bug #605776 (diff) | |
download | gentoo-ef416f3d1295dfc8fb4a0638a3c5f0cab4f9bab2.tar.gz gentoo-ef416f3d1295dfc8fb4a0638a3c5f0cab4f9bab2.tar.bz2 gentoo-ef416f3d1295dfc8fb4a0638a3c5f0cab4f9bab2.zip |
dev-lang/ghc: bump up to 8.0.2, no KEYWORDS or binaries yet
Package-Manager: Portage-2.3.3, Repoman-2.3.1
Diffstat (limited to 'dev-lang/ghc/files')
-rw-r--r-- | dev-lang/ghc/files/ghc-8.0.1-limit-jN.patch | 50 | ||||
-rw-r--r-- | dev-lang/ghc/files/ghc-8.0.1-par-g0-on-A32.patch | 65 | ||||
-rw-r--r-- | dev-lang/ghc/files/ghc-8.0.1-ww-args-limit.patch | 127 | ||||
-rw-r--r-- | dev-lang/ghc/files/ghc-8.0.1_rc1-cgen-constify.patch | 34 | ||||
-rw-r--r-- | dev-lang/ghc/files/ghc-8.0.2_rc2-old-sphinx.patch | 12 |
5 files changed, 288 insertions, 0 deletions
diff --git a/dev-lang/ghc/files/ghc-8.0.1-limit-jN.patch b/dev-lang/ghc/files/ghc-8.0.1-limit-jN.patch new file mode 100644 index 000000000000..dc5410da3020 --- /dev/null +++ b/dev-lang/ghc/files/ghc-8.0.1-limit-jN.patch @@ -0,0 +1,50 @@ +commit 501e05bb1b8974fc8b6c9eee86c87c367e87a211 +Author: Sergei Trofimovich <slyfox@gentoo.org> +Date: Tue Aug 30 12:10:47 2016 +0100 + + GhcMake: limit Capability count to CPU count in parallel mode + + In Trac #9221 one of problems using high --jobs=<N> + is amount of mutator (or GC) threads we crate. + + We use userspace spinning-and-yielding (see ACQUIRE_SPIN_LOCK) + to acess work stealing queues. In case of + N-worker-threads > N-CPUs fraction of time when + thread holding spin lock gets descheduled by kernel + increases. That causes other threads to waste CPU time + before giving up CPU. + + Signed-off-by: Sergei Trofimovich <siarheit@google.com> + + Test Plan: + ghc --make -j8 and -j80 have comparable sys time + on a 8-core system. + + Reviewers: austin, gintas, bgamari, simonmar + + Reviewed By: bgamari, simonmar + + Subscribers: thomie + + Differential Revision: https://phabricator.haskell.org/D2482 + + GHC Trac Issues: #9221 + +diff --git a/compiler/main/GhcMake.hs b/compiler/main/GhcMake.hs +index 9dc43cd..905df63 100644 +--- a/compiler/main/GhcMake.hs ++++ b/compiler/main/GhcMake.hs +@@ -761,7 +761,12 @@ parUpsweep n_jobs old_hpt stable_mods cleanup sccs = do + + let updNumCapabilities = liftIO $ do + n_capabilities <- getNumCapabilities +- unless (n_capabilities /= 1) $ setNumCapabilities n_jobs ++ n_cpus <- getNumProcessors ++ -- Setting number of capabilities more than ++ -- CPU count usually leads to high userspace ++ -- lock contention. Trac #9221 ++ let n_caps = min n_jobs n_cpus ++ unless (n_capabilities /= 1) $ setNumCapabilities n_caps + return n_capabilities + -- Reset the number of capabilities once the upsweep ends. + let resetNumCapabilities orig_n = liftIO $ setNumCapabilities orig_n diff --git a/dev-lang/ghc/files/ghc-8.0.1-par-g0-on-A32.patch b/dev-lang/ghc/files/ghc-8.0.1-par-g0-on-A32.patch new file mode 100644 index 000000000000..b46e57301782 --- /dev/null +++ b/dev-lang/ghc/files/ghc-8.0.1-par-g0-on-A32.patch @@ -0,0 +1,65 @@ +commit bdfc5375f219d6def81effda4e57cb56d01fc917 +Author: Sergei Trofimovich <slyfox@gentoo.org> +Date: Tue Aug 30 12:10:54 2016 +0100 + + rts: enable parallel GC scan of large (32M+) allocation area + + Parallel GC does not scan large allocation area (-A) + effectively as it does not do work stealing from nursery + by default. + + That leads to large imbalance when only one of threads + overflows allocation area: most of GC threads finish + quickly (as there is not much to collect) and sit idle + waiting while single GC thread finishes scan of single + allocation area for that thread. + + The patch enables work stealing for (equivalent of -qb0) + allocation area of -A32M or higher. + + Tested on a highlighting-kate package from Trac #9221 + + On 8-core machine the difference is around 5% faster + of wall-clock time. On 24-core VM the speedup is 20%. + + Signed-off-by: Sergei Trofimovich <siarheit@google.com> + + Test Plan: measured wall time and GC parallelism on highlighting-kate build + + Reviewers: austin, bgamari, erikd, simonmar + + Reviewed By: bgamari, simonmar + + Subscribers: thomie + + Differential Revision: https://phabricator.haskell.org/D2483 + + GHC Trac Issues: #9221 + +diff --git a/rts/RtsFlags.c b/rts/RtsFlags.c +index fda33f0..7a719b9 100644 +--- a/rts/RtsFlags.c ++++ b/rts/RtsFlags.c +@@ -237,1 +237,1 @@ void initRtsFlagsDefaults(void) +- RtsFlags.ParFlags.parGcLoadBalancingGen = 1; ++ RtsFlags.ParFlags.parGcLoadBalancingGen = ~0u; /* auto, based on -A */ +@@ -1398,2 +1390,19 @@ static void normaliseRtsOpts (void) + } + ++#ifdef THREADED_RTS ++ if (RtsFlags.ParFlags.parGcLoadBalancingGen == ~0u) { ++ StgWord alloc_area_bytes ++ = RtsFlags.GcFlags.minAllocAreaSize * BLOCK_SIZE; ++ ++ // If allocation area is larger that CPU cache ++ // we can finish scanning quicker doing work-stealing ++ // scan. Trac #9221 ++ // 32M looks big enough not to fit into L2 cache ++ // of popular modern CPUs. ++ if (alloc_area_bytes >= 32 * 1024 * 1024) { ++ RtsFlags.ParFlags.parGcLoadBalancingGen = 0; ++ } else { ++ RtsFlags.ParFlags.parGcLoadBalancingGen = 1; ++ } ++ } ++#endif diff --git a/dev-lang/ghc/files/ghc-8.0.1-ww-args-limit.patch b/dev-lang/ghc/files/ghc-8.0.1-ww-args-limit.patch new file mode 100644 index 000000000000..4752f4482678 --- /dev/null +++ b/dev-lang/ghc/files/ghc-8.0.1-ww-args-limit.patch @@ -0,0 +1,127 @@ +commit 5efbf0d243984444cf352ad6f0d147e226c64498 +Author: Sergei Trofimovich <slyfox@gentoo.org> +Date: Thu Sep 1 17:34:58 2016 +0100 + + restore -fmax-worker-args handling (Trac #11565) + + maxWorkerArgs handling was accidentally lost 3 years ago + in a major update of demand analysis + commit 0831a12ea2fc73c33652eeec1adc79fa19700578 + + Old regression is noticeable as: + - code bloat (requires stack reshuffling) + - compilation slowdown (more code to optimise/generate) + - and increased heap usage (DynFlags unboxing/reboxing?) + + On a simple compile benchmark this change causes heap + allocation drop from 70G don to 67G (ghc perf build). + + Signed-off-by: Sergei Trofimovich <siarheit@google.com> + + Reviewers: simonpj, ezyang, goldfire, austin, bgamari + + Reviewed By: simonpj, ezyang + + Subscribers: thomie + + Differential Revision: https://phabricator.haskell.org/D2503 + + GHC Trac Issues: #11565 + +diff --git a/compiler/specialise/SpecConstr.hs b/compiler/specialise/SpecConstr.hs +index 10d5614..7166f57 100644 +--- a/compiler/specialise/SpecConstr.hs ++++ b/compiler/specialise/SpecConstr.hs +@@ -29,7 +29,7 @@ import CoreFVs ( exprsFreeVarsList ) + import CoreMonad + import Literal ( litIsLifted ) + import HscTypes ( ModGuts(..) ) +-import WwLib ( mkWorkerArgs ) ++import WwLib ( isWorkerSmallEnough, mkWorkerArgs ) + import DataCon + import Coercion hiding( substCo ) + import Rules +@@ -1533,10 +1533,14 @@ specialise env bind_calls (RI { ri_fn = fn, ri_lam_bndrs = arg_bndrs + + | Just all_calls <- lookupVarEnv bind_calls fn + = -- pprTrace "specialise entry {" (ppr fn <+> ppr (length all_calls)) $ +- do { (boring_call, pats) <- callsToPats env specs arg_occs all_calls +- ++ do { (boring_call, all_pats) <- callsToPats env specs arg_occs all_calls + -- Bale out if too many specialisations +- ; let n_pats = length pats ++ ; let pats = filter (is_small_enough . fst) all_pats ++ is_small_enough vars = isWorkerSmallEnough (sc_dflags env) vars ++ -- We are about to construct w/w pair in 'spec_one'. ++ -- Omit specialisation leading to high arity workers. ++ -- See Note [Limit w/w arity] ++ n_pats = length pats + spec_count' = n_pats + spec_count + ; case sc_count env of + Just max | not (sc_force env) && spec_count' > max +diff --git a/compiler/stranal/WwLib.hs b/compiler/stranal/WwLib.hs +index 09bc204..d9460d9 100644 +--- a/compiler/stranal/WwLib.hs ++++ b/compiler/stranal/WwLib.hs +@@ -8,6 +8,7 @@ + + module WwLib ( mkWwBodies, mkWWstr, mkWorkerArgs + , deepSplitProductType_maybe, findTypeShape ++ , isWorkerSmallEnough + ) where + + #include "HsVersions.h" +@@ -144,7 +145,8 @@ mkWwBodies dflags fam_envs fun_ty demands res_info one_shots + wrapper_body = wrap_fn_args . wrap_fn_cpr . wrap_fn_str . applyToVars work_call_args . Var + worker_body = mkLams work_lam_args. work_fn_str . work_fn_cpr . work_fn_args + +- ; if useful1 && not (only_one_void_argument) || useful2 ++ ; if isWorkerSmallEnough dflags work_args ++ && (useful1 && not only_one_void_argument || useful2) + then return (Just (worker_args_dmds, wrapper_body, worker_body)) + else return Nothing + } +@@ -165,6 +167,12 @@ mkWwBodies dflags fam_envs fun_ty demands res_info one_shots + | otherwise + = False + ++-- See Note [Limit w/w arity] ++isWorkerSmallEnough :: DynFlags -> [Var] -> Bool ++isWorkerSmallEnough dflags vars = count isId vars <= maxWorkerArgs dflags ++ -- We count only Free variables (isId) to skip Type, Kind ++ -- variables which have no runtime representation. ++ + {- + Note [Always do CPR w/w] + ~~~~~~~~~~~~~~~~~~~~~~~~ +@@ -178,6 +186,30 @@ a disaster, because then the enclosing function might say it has the CPR + property, but now doesn't and there a cascade of disaster. A good example + is Trac #5920. + ++Note [Limit w/w arity] ++~~~~~~~~~~~~~~~~~~~~~~~~ ++Guard against high worker arity as it generates a lot of stack traffic. ++A simplified example is Trac #11565#comment:6 ++ ++Current strategy is very simple: don't perform w/w transformation at all ++if the result produces a wrapper with arity higher than -fmax-worker-args=. ++ ++It is a bit all or nothing, consider ++ ++ f (x,y) (a,b,c,d,e ... , z) = rhs ++ ++Currently we will remove all w/w ness entirely. But actually we could ++w/w on the (x,y) pair... it's the huge product that is the problem. ++ ++Could we instead refrain from w/w on an arg-by-arg basis? Yes, that'd ++solve f. But we can get a lot of args from deeply-nested products: ++ ++ g (a, (b, (c, (d, ...)))) = rhs ++ ++This is harder to spot on an arg-by-arg basis. Previously mkWwStr was ++given some "fuel" saying how many arguments it could add; when we ran ++out of fuel it would stop w/wing. ++Still not very clever because it had a left-right bias. + + ************************************************************************ + * * diff --git a/dev-lang/ghc/files/ghc-8.0.1_rc1-cgen-constify.patch b/dev-lang/ghc/files/ghc-8.0.1_rc1-cgen-constify.patch new file mode 100644 index 000000000000..877a5827e4ea --- /dev/null +++ b/dev-lang/ghc/files/ghc-8.0.1_rc1-cgen-constify.patch @@ -0,0 +1,34 @@ +commit b0cf3ab7a69b878a4335d21a347b56e4b0ca0b7b +Author: Sergei Trofimovich <slyfox@gentoo.org> +Date: Mon Apr 14 19:06:24 2014 +0300 + + compiler/cmm/PprC.hs: constify local string literals + + Consider one-line module + module B (v) where v = "hello" + in -fvia-C mode it generates code like + static char gibberish_str[] = "hello"; + + It uselessly eats data section (precious resource on ia64!). + The patch switches genrator to emit: + static const char gibberish_str[] = "hello"; + + Signed-off-by: Sergei Trofimovich <slyfox@gentoo.org> + +diff --git a/compiler/cmm/PprC.hs b/compiler/cmm/PprC.hs +index 2398981..fdb578d 100644 +--- a/compiler/cmm/PprC.hs ++++ b/compiler/cmm/PprC.hs +@@ -112,6 +112,12 @@ pprTop (CmmProc infos clbl _ graph) = + + -- We only handle (a) arrays of word-sized things and (b) strings. + ++pprTop (CmmData (Section ReadOnlyData _) (Statics lbl [CmmString str])) = ++ hcat [ ++ pprLocalness lbl, ptext (sLit "const char "), ppr lbl, ++ ptext (sLit "[] = "), pprStringInCStyle str, semi ++ ] ++ + pprTop (CmmData _section (Statics lbl [CmmString str])) = + hcat [ + pprLocalness lbl, ptext (sLit "char "), ppr lbl, diff --git a/dev-lang/ghc/files/ghc-8.0.2_rc2-old-sphinx.patch b/dev-lang/ghc/files/ghc-8.0.2_rc2-old-sphinx.patch new file mode 100644 index 000000000000..a4d49d3ef808 --- /dev/null +++ b/dev-lang/ghc/files/ghc-8.0.2_rc2-old-sphinx.patch @@ -0,0 +1,12 @@ +diff --git a/docs/users_guide/profiling.rst b/docs/users_guide/profiling.rst +index 4d0bb3a..f980f72 100644 +--- a/docs/users_guide/profiling.rst ++++ b/docs/users_guide/profiling.rst +@@ -435,7 +435,2 @@ To generate a heap profile from your program: + +-For example, here is a heap profile produced for the ``sphere`` program +-from GHC's ``nofib`` benchmark suite, +- +-.. image:: images/prof_scc.* +- + You might also want to take a look at |