import requests
import json
from bs4 import BeautifulSoup
class JianShuItem(object):
    """One crawled story: a title plus the paragraphs of its body."""

    def __init__(self, title, content):
        self.title = title
        self.content = content

    def toObj(self):
        # Plain dict/list structure so json.dumps() can serialize it.
        return {"title": self.title, "content": self.content}

    def console(self):
        # Dump the item to stdout for quick inspection while crawling.
        print(self.title)
        for paragraph in self.content:
            print(paragraph)
        print("=========================")
class JianShuCrawler(object):
    def __init__(self):
        self.stories = []

    def loadPageItems(self, pageIndex):
        # Fetch one page of the collection's article list.
        url = ('http://www.jianshu.com/collections/38/notes'
               '?order_by=added_at&page=' + str(pageIndex))
        response = requests.get(url)
        soup = BeautifulSoup(response.text, "html.parser")
        # Each article title on the list page sits in an <h4 class="title">.
        for t in soup.find_all('h4', class_="title"):
            link = t.select("a")[0]
            item = JianShuItem(link.get_text(), self.getDetail(link["href"]))
            item.console()
            self.stories.append(item.toObj())

    def getDetail(self, url):
        # The list page uses relative hrefs, so prepend the site root.
        contentUrl = 'http://www.jianshu.com' + url
        response = requests.get(contentUrl)
        soup = BeautifulSoup(response.text, "html.parser")
        content = soup.find('div', attrs={"class": "show-content"})
        if content is None:
            # Article body missing (layout change or deleted post).
            return []
        # Return the text of each paragraph in the article body.
        return [p.get_text() for p in content.select("p")]

    def start(self):
        # jianshu numbers collection pages from 1; crawl the first ten.
        for i in range(1, 11):
            self.loadPageItems(i)
        self.save()

    def save(self):
        # ensure_ascii=False keeps the Chinese text readable in the file.
        jsonStr = json.dumps(self.stories, ensure_ascii=False)
        print(jsonStr)
        with open('data.txt', 'wt', encoding='utf-8') as f:
            f.write(jsonStr)
if __name__ == '__main__':
    crawler = JianShuCrawler()
    crawler.start()
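
For reference, a minimal sketch of reading the saved results back in a separate script, assuming the crawler above has already run and written data.txt to the current directory:

import json

# Load the crawled stories back from the JSON file the crawler wrote.
with open('data.txt', encoding='utf-8') as f:
    stories = json.load(f)

for story in stories:
    print(story["title"], "-", len(story["content"]), "paragraphs")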