Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
33 changes: 24 additions & 9 deletions PttWebCrawler/crawler.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,11 +21,11 @@
VERIFY = False
requests.packages.urllib3.disable_warnings()


class PttWebCrawler(object):

PTT_URL = 'https://www.ptt.cc'

SEARCH_TITLE = None

"""docstring for PttWebCrawler"""
def __init__(self, cmdline=None, as_lib=False):
parser = argparse.ArgumentParser(formatter_class=argparse.RawDescriptionHelpFormatter, description='''
Expand All @@ -34,6 +34,7 @@ def __init__(self, cmdline=None, as_lib=False):
Output: BOARD_NAME-START_INDEX-END_INDEX.json (or BOARD_NAME-ID.json)
''')
parser.add_argument('-b', metavar='BOARD_NAME', help='Board name', required=True)
parser.add_argument('-s', metavar='SEARCH_TITLE', help="Search Title")
group = parser.add_mutually_exclusive_group(required=True)
group.add_argument('-i', metavar=('START_INDEX', 'END_INDEX'), type=int, nargs=2, help="Start and end index")
group.add_argument('-a', metavar='ARTICLE_ID', help="Article ID")
Expand All @@ -45,6 +46,10 @@ def __init__(self, cmdline=None, as_lib=False):
else:
args = parser.parse_args()
board = args.b

if args.s:
self.SEARCH_TITLE = args.s

if args.i:
start = args.i[0]
if args.i[1] == -1:
Expand All @@ -57,16 +62,26 @@ def __init__(self, cmdline=None, as_lib=False):
self.parse_article(article_id, board)

def parse_articles(self, start, end, board, path='.', timeout=3):
filename = board + '-' + str(start) + '-' + str(end) + '.json'
if self.SEARCH_TITLE:
filename = board + '-' + self.SEARCH_TITLE + '-' + str(start) + '-' + str(end) + '.json'
else:
filename = board + '-' + str(start) + '-' + str(end) + '.json'
filename = os.path.join(path, filename)
self.store(filename, u'{"articles": [', 'w')
self.store(filename, u'[', 'w')
for i in range(end-start+1):
index = start + i
print('Processing index:', str(index))
resp = requests.get(
url = self.PTT_URL + '/bbs/' + board + '/index' + str(index) + '.html',
cookies={'over18': '1'}, verify=VERIFY, timeout=timeout
)
if self.SEARCH_TITLE:
resp = requests.get(
url = self.PTT_URL + '/bbs/' + board + '/search', params={'q': self.SEARCH_TITLE},
cookies={'over18': '1'}, verify=VERIFY, timeout=timeout
)
else:
resp = requests.get(
url = self.PTT_URL + '/bbs/' + board + '/index' + str(index) + '.html',
cookies={'over18': '1'}, verify=VERIFY, timeout=timeout
)

if resp.status_code != 200:
print('invalid url:', resp.url)
continue
Expand All @@ -85,7 +100,7 @@ def parse_articles(self, start, end, board, path='.', timeout=3):
except:
pass
time.sleep(0.1)
self.store(filename, u']}', 'a')
self.store(filename, u']', 'a')
return filename

def parse_article(self, article_id, board, path='.'):
Expand Down
83 changes: 83 additions & 0 deletions PttWebCrawler/parser.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,83 @@
import json
import csv
import argparse
# import re

__version__ = '1.0'
# __ITEM_NAME__ = '物品內容 '
# __ITEM_PRICE__ = '交易價格 '
# __ITEM_DETAIL__ = '詳細說明 '

class JsonPaser:
    """Post-process crawled PTT articles into CSV.

    Pipeline (as wired in ``__init__``):
      * ``draw_data`` flattens the crawler's JSON output (``IN_FILE``) into
        ``output.csv`` — one row per article: title, content.  Currently
        disabled in ``__init__``, matching the original behavior.
      * ``get_info`` scans ``output.csv`` for the PTT for-sale template
        markers (item name / price / detail) and writes one comma-joined
        line per article to ``OUT_FILE``.
    """

    def __init__(self, cmdline=None, as_lib=False):
        # CLI front-end; `cmdline` lets callers inject argv (e.g. for tests).
        parser = argparse.ArgumentParser(formatter_class=argparse.RawDescriptionHelpFormatter, description='''
    Parsing data from ptt
    ''')
        # Fixed typo in user-facing help text ('Inpyt' -> 'Input').
        parser.add_argument('-i', metavar='IN_FILE', help='Input file', required=True)
        parser.add_argument('-o', metavar='OUT_FILE', help='Output file', required=True)
        parser.add_argument('-v', '--version', action='version', version='%(prog)s ' + __version__)

        args = parser.parse_args(cmdline)
        self.IN_FILE = args.i
        self.OUT_FILE = args.o
        # self.draw_data()  # kept disabled, as in the original pipeline
        self.get_info()

    def draw_data(self):
        """Flatten the crawler's JSON article list into ``output.csv``.

        Reads ``self.IN_FILE`` (a JSON array of article objects) and writes
        one ``[article_title, content]`` row per article.
        """
        print("self.IN_FILE=" + self.IN_FILE)
        # Context managers guarantee both handles are closed (the original
        # leaked the input file handle); explicit UTF-8 keeps CJK content
        # working regardless of the platform's locale encoding.
        with open(self.IN_FILE, "r", encoding='utf-8') as input_file:
            json_array = json.load(input_file)

        with open('output.csv', 'w', newline='', encoding='utf-8') as csvfile:
            writer = csv.writer(csvfile)
            for item in json_array:
                writer.writerow([item['article_title'], item['content']])

    def get_info(self):
        """Extract item name / price / detail from ``output.csv`` rows.

        Writes one line per article to ``self.OUT_FILE`` in the form
        ``<name>,<price>,<detail-with-tag>,``.  Rows that are malformed or
        do not contain all three template markers are skipped (the original
        emitted garbage slices for those).
        """
        # Template markers used by PTT for-sale posts (trailing space is
        # part of the marker).  Slicing below uses len(tag) instead of the
        # original hard-coded +5/+18 offsets, so it cannot silently desync.
        item_name_tag = '物品內容 '
        item_price_tag = '交易價格 '
        price_note_tag = '(請務必填寫,成交後嚴禁刪修價格) '
        item_detail_tag = '詳細說明 '

        with open('output.csv', newline='', encoding='utf-8') as csvfile, \
                open(self.OUT_FILE, 'w', encoding='utf-8') as out:
            for row in csv.reader(csvfile):
                if len(row) < 2:
                    continue  # malformed row: no content column
                content = row[1]
                name_at = content.find(item_name_tag)
                price_at = content.find(item_price_tag)
                detail_at = content.find(item_detail_tag)
                if name_at == -1 or price_at == -1 or detail_at == -1:
                    continue  # not a for-sale template post

                # Item name: text between the name tag and the price tag.
                item_name = content[name_at + len(item_name_tag):price_at]

                # Item price: text between the price tag and the detail tag;
                # drop the boilerplate reminder note when present.
                item_price = content[price_at + len(item_price_tag):detail_at]
                note_at = item_price.find(price_note_tag)
                if note_at != -1:
                    item_price = item_price[note_at + len(price_note_tag):]

                # Item detail: everything from the detail tag to the end —
                # the tag itself is included, matching the original output.
                item_detail = content[detail_at:]

                out.write(item_name + ',' + item_price + ',' + item_detail + ',\n')


if __name__ == '__main__':
    # CLI entry point: parse argv and run the extraction immediately.
    _cli = JsonPaser()
3 changes: 2 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,7 @@
### 參數說明

```commandline
python crawler.py -b 看板名稱 -i 起始索引 結束索引 (設為負數則以倒數第幾頁計算)
python crawler.py -b 看板名稱 [-s 標題關鍵字搜尋] -i 起始索引 結束索引 (設為負數則以倒數第幾頁計算)
python crawler.py -b 看板名稱 -a 文章ID
```

Expand Down Expand Up @@ -89,6 +89,7 @@ python test.py
optional arguments:
-h, --help show this help message and exit
-b BOARD_NAME Board name
-s SEARCH_TITLE Search article title by keyword
-i START_INDEX END_INDEX Start and end index
-a ARTICLE_ID Article ID
-v, --version show program's version number and exit
Expand Down