Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
33 changes: 24 additions & 9 deletions PttWebCrawler/crawler.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,11 +21,11 @@
VERIFY = False
requests.packages.urllib3.disable_warnings()


class PttWebCrawler(object):

PTT_URL = 'https://www.ptt.cc'

SEARCH_TITLE = None

"""docstring for PttWebCrawler"""
def __init__(self, cmdline=None, as_lib=False):
parser = argparse.ArgumentParser(formatter_class=argparse.RawDescriptionHelpFormatter, description='''
Expand All @@ -34,6 +34,7 @@ def __init__(self, cmdline=None, as_lib=False):
Output: BOARD_NAME-START_INDEX-END_INDEX.json (or BOARD_NAME-ID.json)
''')
parser.add_argument('-b', metavar='BOARD_NAME', help='Board name', required=True)
parser.add_argument('-s', metavar='SEARCH_TITLE', help="Search Title")
group = parser.add_mutually_exclusive_group(required=True)
group.add_argument('-i', metavar=('START_INDEX', 'END_INDEX'), type=int, nargs=2, help="Start and end index")
group.add_argument('-a', metavar='ARTICLE_ID', help="Article ID")
Expand All @@ -45,6 +46,10 @@ def __init__(self, cmdline=None, as_lib=False):
else:
args = parser.parse_args()
board = args.b

if args.s:
self.SEARCH_TITLE = args.s

if args.i:
start = args.i[0]
if args.i[1] == -1:
Expand All @@ -57,16 +62,26 @@ def __init__(self, cmdline=None, as_lib=False):
self.parse_article(article_id, board)

def parse_articles(self, start, end, board, path='.', timeout=3):
filename = board + '-' + str(start) + '-' + str(end) + '.json'
if self.SEARCH_TITLE:
filename = board + '-' + self.SEARCH_TITLE + '-' + str(start) + '-' + str(end) + '.json'
else:
filename = board + '-' + str(start) + '-' + str(end) + '.json'
filename = os.path.join(path, filename)
self.store(filename, u'{"articles": [', 'w')
self.store(filename, u'[', 'w')
for i in range(end-start+1):
index = start + i
print('Processing index:', str(index))
resp = requests.get(
url = self.PTT_URL + '/bbs/' + board + '/index' + str(index) + '.html',
cookies={'over18': '1'}, verify=VERIFY, timeout=timeout
)
if self.SEARCH_TITLE:
resp = requests.get(
url = self.PTT_URL + '/bbs/' + board + '/search', params={'q': self.SEARCH_TITLE},
cookies={'over18': '1'}, verify=VERIFY, timeout=timeout
)
else:
resp = requests.get(
url = self.PTT_URL + '/bbs/' + board + '/index' + str(index) + '.html',
cookies={'over18': '1'}, verify=VERIFY, timeout=timeout
)

if resp.status_code != 200:
print('invalid url:', resp.url)
continue
Expand All @@ -85,7 +100,7 @@ def parse_articles(self, start, end, board, path='.', timeout=3):
except:
pass
time.sleep(0.1)
self.store(filename, u']}', 'a')
self.store(filename, u']', 'a')
return filename

def parse_article(self, article_id, board, path='.'):
Expand Down
83 changes: 83 additions & 0 deletions PttWebCrawler/parser.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,83 @@
import json
import csv
import argparse
# import re

__version__ = '1.0'
# __ITEM_NAME__ = '物品內容 '
# __ITEM_PRICE__ = '交易價格 '
# __ITEM_DETAIL__ = '詳細說明 '

class JsonPaser:
    """Post-process crawled PTT articles into CSV.

    Pipeline (as wired in ``__init__``):
      * ``draw_data`` flattens the crawler's JSON output (``IN_FILE``) into
        ``output.csv`` — one row per article: title, content.  Currently
        disabled in ``__init__``, matching the original behavior.
      * ``get_info`` scans ``output.csv`` for the PTT for-sale template
        markers (item name / price / detail) and writes one comma-joined
        line per article to ``OUT_FILE``.
    """

    def __init__(self, cmdline=None, as_lib=False):
        # CLI front-end; `cmdline` lets callers inject argv (e.g. for tests).
        parser = argparse.ArgumentParser(formatter_class=argparse.RawDescriptionHelpFormatter, description='''
    Parsing data from ptt
    ''')
        # Fixed typo in user-facing help text ('Inpyt' -> 'Input').
        parser.add_argument('-i', metavar='IN_FILE', help='Input file', required=True)
        parser.add_argument('-o', metavar='OUT_FILE', help='Output file', required=True)
        parser.add_argument('-v', '--version', action='version', version='%(prog)s ' + __version__)

        args = parser.parse_args(cmdline)
        self.IN_FILE = args.i
        self.OUT_FILE = args.o
        # self.draw_data()  # kept disabled, as in the original pipeline
        self.get_info()

    def draw_data(self):
        """Flatten the crawler's JSON article list into ``output.csv``.

        Reads ``self.IN_FILE`` (a JSON array of article objects) and writes
        one ``[article_title, content]`` row per article.
        """
        print("self.IN_FILE=" + self.IN_FILE)
        # Context managers guarantee both handles are closed (the original
        # leaked the input file handle); explicit UTF-8 keeps CJK content
        # working regardless of the platform's locale encoding.
        with open(self.IN_FILE, "r", encoding='utf-8') as input_file:
            json_array = json.load(input_file)

        with open('output.csv', 'w', newline='', encoding='utf-8') as csvfile:
            writer = csv.writer(csvfile)
            for item in json_array:
                writer.writerow([item['article_title'], item['content']])

    def get_info(self):
        """Extract item name / price / detail from ``output.csv`` rows.

        Writes one line per article to ``self.OUT_FILE`` in the form
        ``<name>,<price>,<detail-with-tag>,``.  Rows that are malformed or
        do not contain all three template markers are skipped (the original
        emitted garbage slices for those).
        """
        # Template markers used by PTT for-sale posts (trailing space is
        # part of the marker).  Slicing below uses len(tag) instead of the
        # original hard-coded +5/+18 offsets, so it cannot silently desync.
        item_name_tag = '物品內容 '
        item_price_tag = '交易價格 '
        price_note_tag = '(請務必填寫,成交後嚴禁刪修價格) '
        item_detail_tag = '詳細說明 '

        with open('output.csv', newline='', encoding='utf-8') as csvfile, \
                open(self.OUT_FILE, 'w', encoding='utf-8') as out:
            for row in csv.reader(csvfile):
                if len(row) < 2:
                    continue  # malformed row: no content column
                content = row[1]
                name_at = content.find(item_name_tag)
                price_at = content.find(item_price_tag)
                detail_at = content.find(item_detail_tag)
                if name_at == -1 or price_at == -1 or detail_at == -1:
                    continue  # not a for-sale template post

                # Item name: text between the name tag and the price tag.
                item_name = content[name_at + len(item_name_tag):price_at]

                # Item price: text between the price tag and the detail tag;
                # drop the boilerplate reminder note when present.
                item_price = content[price_at + len(item_price_tag):detail_at]
                note_at = item_price.find(price_note_tag)
                if note_at != -1:
                    item_price = item_price[note_at + len(price_note_tag):]

                # Item detail: everything from the detail tag to the end —
                # the tag itself is included, matching the original output.
                item_detail = content[detail_at:]

                out.write(item_name + ',' + item_price + ',' + item_detail + ',\n')


if __name__ == '__main__':
    # CLI entry point: parse argv and run the extraction immediately.
    _cli = JsonPaser()
3 changes: 2 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,7 @@
### 參數說明

```commandline
python crawler.py -b 看板名稱 -i 起始索引 結束索引 (設為負數則以倒數第幾頁計算)
python crawler.py -b 看板名稱 [-s 標題關鍵字搜尋] -i 起始索引 結束索引 (設為負數則以倒數第幾頁計算)
python crawler.py -b 看板名稱 -a 文章ID
```

Expand Down Expand Up @@ -89,6 +89,7 @@ python test.py
optional arguments:
-h, --help show this help message and exit
-b BOARD_NAME Board name
-s SEARCH_TITLE Search article title by keyword
-i START_INDEX END_INDEX Start and end index
-a ARTICLE_ID Article ID
-v, --version show program's version number and exit
Expand Down