From 87d87ed6a5cb7c0b6439e26b8d0f098d156e78ad Mon Sep 17 00:00:00 2001
From: Bensong Liu <bensl@microsoft.com>
Date: Fri, 30 Apr 2021 17:55:09 +0800
Subject: [PATCH] save

---
 README.md   |  4 ++++
 bullshit.py | 23 ++++++++++++++++++++---
 2 files changed, 24 insertions(+), 3 deletions(-)

diff --git a/README.md b/README.md
index 80943c0..b55da78 100644
--- a/README.md
+++ b/README.md
@@ -16,6 +16,10 @@ The second section should use the bootstrap data.json and the topic above, to en
 
 The third section is the actual data section. 
 
+## dataset guide
+
+A `famous` entry must not contain `,` or `。` before its `$prefix` placeholder.
+
 ----
 
 # 狗屁不通文章生成器
diff --git a/bullshit.py b/bullshit.py
index 7f4f15b..0a00c62 100644
--- a/bullshit.py
+++ b/bullshit.py
@@ -18,6 +18,11 @@ prefix_data   = list(data['prefixes' ]) # 在famous_data前面弄点nonsense_dat
 postfix_data  = list(data['postfixes']) # 在famous_data后面弄点nonsense_data
 nonsense_data = list(data['shits'    ]) # 代表文章主要nonsense_data来源
 
+famous_bits_count = 6  # width passed to slice_bits() when encode() picks an index into famous_data
+prefix_bits_count = 2  # width passed to slice_bits() when encode() picks an index into prefix_data
+postfix_bits_count = 3  # width passed to slice_bits() when encode() picks an index into postfix_data
+nonsense_bits_count = 5  # presumably the same for nonsense_data; unused in the code shown here -- TODO confirm
+
 print("debug: len=", [len(l) for l in [famous_data, prefix_data, postfix_data, nonsense_data]])
 
 repeat_factor = 2
@@ -55,8 +60,12 @@ def decode(text):
         if paragraph == '':
             continue
 
+        # 1. famous-prefix reorder. 
+
+        # 2. element match and decode bits.
 
-def encode(text, topic, data)
+
+def encode(text, topic, data):
     result = '    '
     curr_paragraph = ''
     curr_data_offset = 0
@@ -65,8 +74,16 @@ def encode(text, topic, data)
             result += curr_paragraph + paragraph_tail()
             curr_paragraph = ''
         elif randint(0,100) < 20 :
-            curr_paragraph += new_famous()
+            # add a famous
+            _index0 = slice_bits(data, curr_data_offset, prefix_bits_count)
+            curr_data_offset += prefix_bits_count
+            _index1 = slice_bits(data, curr_data_offset, famous_bits_count)
+            curr_data_offset += famous_bits_count
+            _index2 = slice_bits(data, curr_data_offset, postfix_bits_count)
+            curr_data_offset += postfix_bits_count
+
+            curr_paragraph += new_famous(famous_data[_index1], prefix_data[_index0], postfix_data[_index2])
         else:
-            curr_paragraph += next(nonsense_generator)
+            pass  # TODO: emit nonsense text here; `pass` keeps this branch valid, a comment-only suite is a syntax error
     result = result.replace("$topic",topic)
     print(result)
-- 
GitLab