{"id":474,"date":"2013-07-17T05:16:40","date_gmt":"2013-07-16T21:16:40","guid":{"rendered":"http:\/\/study.5dimn.com\/?p=474"},"modified":"2020-03-30T11:47:00","modified_gmt":"2020-03-30T03:47:00","slug":"%e7%94%a8python%e6%89%b9%e9%87%8f%e6%8a%93%e5%8f%96%e8%b1%86%e7%93%a3%e6%97%a5%e5%bf%97","status":"publish","type":"post","link":"https:\/\/study.5dimn.com\/?p=474","title":{"rendered":"\u7528Python\u6279\u91cf\u6293\u53d6\u8c46\u74e3\u65e5\u5fd7"},"content":{"rendered":"<p>\u6628\u5929\u5728\u8c46\u74e3\u770b\u5230\u6709\u4eba\u53d1\u72b6\u6001\u95ee\u5982\u4f55\u628a\u8c46\u74e3\u65e5\u5fd7\u4fdd\u5b58\u5230\u672c\u5730\uff0c\u6b63\u597d\u6700\u8fd1\u5728\u5b66python\uff0c\u5c31\u60f3\u5230\u7528python\u5199\u4e00\u6bb5\u5c0f\u7a0b\u5e8f\uff0c\u7ec3\u7ec3\u624b\u3002<\/p>\n<p>\u89e3\u6790HTML\u7528\u7684\u662fBeautifulSoup\u5e93\uff0c\u770b\u4e86\u4e00\u4e0b\u6587\u6863\uff0c\u8fd8\u7b97\u7b80\u5355\uff0c\u4f46\u662f\u6709\u4e9b\u5947\u602a\u7684\u95ee\u9898\u6211\u4e00\u65f6\u5f04\u4e0d\u6e05\u695a\u4e3a\u4ec0\u4e48\uff0c\u6240\u4ee5\u90e8\u5206\u529f\u80fd\u662f\u7528\u4e86\u6bd4\u8f83\u66f2\u6298\u7684\u65b9\u6cd5\u5b9e\u73b0\u7684\u2026\u2026<\/p>\n<p>\u7528\u6cd5\uff1a\u8fd0\u884c\u8fd9\u6bb5\u7a0b\u5e8f\u65f6\uff0c\u5c06\u7528\u6237\u540d\u4f5c\u4e3a\u53c2\u6570\uff0c\u5982\u679c\u4f60\u6ca1\u6709\u8bbe\u7f6e\u8c46\u74e3\u7684\u7528\u6237\u540d\uff0c\u90a3\u5c31\u662f\u4e00\u4e32\u6570\u5b57\uff0c\u4e5f\u5c31\u662f\u4f60\u4e2a\u4eba\u4e3b\u9875\u7684\u7f51\u5740\u91cc\u201cpeople\u201d\u540e\u9762\u90a3\u4e00\u4e32\uff0c\u4f8b\u5982\uff08\u4ee5\u5f53\u521d\u63d0\u95ee\u7684\u8fd9\u4f4d\u540c\u5b66\u4e3a\u4f8b\uff09\uff1a<\/p>\n<blockquote>\n<pre>~$ python crawl.py duanzhang &gt;&gt; blogbackup.txt\n<\/pre>\n<\/blockquote>\n<p>\u6548\u679c\u5982\u4e0b\uff1a<\/p>\n<p><a href=\"http:\/\/study.5dimn.com\/wp-content\/uploads\/2013\/07\/2013-07-16-221457_532x740_scrot.png\" class=\"highslide-image\" onclick=\"return hs.expand(this);\"><img loading=\"lazy\" decoding=\"async\" class=\"size-medium wp-image-478 alignnone\" src=\"http:\/\/study.5dimn.com\/wp-content\/uploads\/2013\/07\/2013-07-16-221457_532x740_scrot-215x300.png\" alt=\"2013-07-16-221457_532x740_scrot\" width=\"215\" height=\"300\" srcset=\"https:\/\/study.5dimn.com\/wp-content\/uploads\/2013\/07\/2013-07-16-221457_532x740_scrot-215x300.png 215w, https:\/\/study.5dimn.com\/wp-content\/uploads\/2013\/07\/2013-07-16-221457_532x740_scrot.png 532w\" sizes=\"auto, (max-width: 215px) 100vw, 215px\" \/><\/a><\/p>\n<p>\u6709\u4e2a\u529f\u80fd\u7f3a\u9677\u662f\uff0c\u5982\u679c\u65e5\u8bb0\u91cc\u6709\u56fe\u7247\uff0c\u53ea\u80fd\u6293\u5230\u90a3\u90e8\u5206\u7684HTML\u4ee3\u7801\uff0c\u6709\u5174\u8da3\u7684\u540c\u5b66\u53ef\u4ee5\u7ee7\u7eed\u6269\u5145\uff0c\u4f46\u662f\u4f30\u8ba1\u5c31\u8981\u60f3\u7740\u4fdd\u5b58\u4e3a\u7f51\u9875\u6587\u4ef6\u4e86\uff0c\u4ee5\u4fbf\u539f\u8c8c\u5448\u73b0\u56fe\u6587\u6df7\u6392\uff0c\u5f53\u7136\uff0c\u4e5f\u53ef\u4ee5\u4ec5\u4ec5\u4f5c\u4e3a\u5907\u4efd\uff0c\u628a\u56fe\u7247\u6293\u4e0b\u6765\u4fdd\u5b58\u5728\u540c\u4e00\u4e2a\u76ee\u5f55\u91cc\uff0c\u5927\u5bb6\u6839\u636e\u81ea\u5df1\u9700\u6c42\u4fee\u6539\u5427\u3002<\/p>\n<p>\u597d\u4e86\uff0c\u5c11\u5e9f\u8bdd\uff0c\u4e0a\u4ee3\u7801\uff1a<\/p>\n\n<div class=\"wp-block-syntaxhighlighter-code \"><pre class=\"brush: python; auto-links: false; title: ; notranslate\" title=\"\">\nfrom bs4 import BeautifulSoup\nimport urllib2\nimport re\nimport sys\n\t\ndef get_each_blog(url):\n\tcontent = urllib2.urlopen(url)\n\tsoup = BeautifulSoup(content)\n\t\t\n\theader = soup.find_all(&quot;div&quot;, attrs={&quot;class&quot;: &quot;note-header&quot;})\n\ttitle = header&#x5B;0].h1.get_text()\n\tdate = header&#x5B;0].span.get_text()\n\tprint(title.encode(&quot;UTF-8&quot;))\n\tprint(date.encode(&quot;UTF-8&quot;))\n\tprint(&#039;\\n&#039;)\n\t\n\tbody = soup.find_all(&quot;div&quot;, attrs={&quot;class&quot;: &quot;note&quot;})\n\tbody = str(body&#x5B;1])\n\tbody = body.replace(&quot;&quot;&quot;&lt;div class=&quot;note&quot; id=&quot;link-report&quot;&gt;&quot;&quot;&quot;, &quot;&quot;)\n\tbody = body.replace(&quot;&lt;\/div&gt;&quot;, &quot;&quot;)\n\tbody = body.replace(&quot;\\n&quot;, &quot;&quot;)\n\tbody = body.replace(&quot;&lt;br\/&gt;&quot;, &quot;\\n&quot;)\n\tprint(body)\n\tprint(&#039;\\n&#039;)\n\tprint(&#039;\\n&#039;)\n\ndef link_list(pageurl):\n\tcontent = urllib2.urlopen(pageurl)\n\tsoup = BeautifulSoup(content)\n\turllist = soup.find_all(&quot;div&quot;, attrs={&quot;class&quot;: &quot;rr&quot;})\n\tfor i in urllist:\n\t\tblog_url= (&#039;http:\/\/www.douban.com\/note\/&#039;+(i&#x5B;&#039;id&#039;].split(&#039;-&#039;))&#x5B;1])\n\t\tget_each_blog(blog_url)\n\nbaseurl = &quot;http:\/\/www.douban.com\/people\/{0}\/notes\/&quot;.format(sys.argv&#x5B;1])\ncontent = urllib2.urlopen(baseurl)\nsoup = BeautifulSoup(content)\nlink_list(baseurl)\npage_list = soup.find_all(&quot;link&quot;, rel=&quot;next&quot;)\nwhile page_list != &#x5B;]:\n\tpageurl = page_list&#x5B;0]&#x5B;&#039;href&#039;]\n\tlink_list(pageurl)\n\tpage_list = BeautifulSoup(urllib2.urlopen(pageurl)).find_all(&quot;link&quot;, rel=&quot;next&quot;)\n<\/pre><\/div>","protected":false},"excerpt":{"rendered":"<p>\u6628\u5929\u5728\u8c46\u74e3\u770b\u5230\u6709\u4eba\u53d1\u72b6\u6001\u95ee\u5982\u4f55\u628a\u8c46\u74e3\u65e5\u5fd7\u4fdd\u5b58\u5230\u672c\u5730\uff0c\u6b63\u597d\u6700\u8fd1\u5728\u5b66python\uff0c\u5c31\u60f3\u5230\u7528python\u5199\u4e00\u6bb5\u5c0f\u7a0b\u5e8f\uff0c\u7ec3\u7ec3\u624b\u3002 \u89e3\u6790HTML\u7528\u7684\u662fBeautifulSoup\u5e93\uff0c\u770b\u4e86\u4e00\u4e0b\u6587\u6863\uff0c\u8fd8\u7b97\u7b80\u5355\uff0c\u4f46\u662f\u6709\u4e9b\u5947\u602a\u7684\u95ee\u9898\u6211\u4e00\u65f6\u5f04\u4e0d\u6e05\u695a\u4e3a\u4ec0\u4e48\uff0c\u6240\u4ee5\u90e8\u5206\u529f\u80fd\u662f\u7528\u4e86\u6bd4\u8f83\u66f2\u6298\u7684\u65b9\u6cd5\u5b9e\u73b0\u7684\u2026\u2026 \u7528\u6cd5\uff1a\u8fd0\u884c\u8fd9\u6bb5\u7a0b\u5e8f\u65f6\uff0c\u5c06\u7528\u6237\u540d\u4f5c\u4e3a\u53c2\u6570\uff0c\u5982\u679c\u4f60\u6ca1\u6709\u8bbe\u7f6e\u8c46\u74e3\u7684\u7528\u6237\u540d\uff0c\u90a3\u5c31\u662f\u4e00\u4e32\u6570\u5b57\uff0c\u4e5f\u5c31\u662f\u4f60\u4e2a\u4eba\u4e3b\u9875\u7684\u7f51\u5740\u91cc\u201cpeople\u201d\u540e\u9762\u90a3\u4e00\u4e32\uff0c\u4f8b\u5982\uff08\u4ee5\u5f53\u521d\u63d0\u95ee\u7684\u8fd9\u4f4d\u540c\u5b66\u4e3a\u4f8b\uff09\uff1a ~$ python crawl.py duanzhang &gt;&gt; blogbackup.txt \u6548\u679c\u5982\u4e0b\uff1a \u6709\u4e2a\u529f\u80fd\u7f3a\u9677\u662f\uff0c\u5982\u679c\u65e5\u8bb0\u91cc\u6709\u56fe\u7247\uff0c\u53ea\u80fd\u6293\u5230\u90a3\u90e8\u5206\u7684HTML\u4ee3\u7801\uff0c\u6709\u5174\u8da3\u7684\u540c\u5b66\u53ef\u4ee5\u7ee7\u7eed\u6269\u5145\uff0c\u4f46\u662f\u4f30\u8ba1\u5c31\u8981\u60f3\u7740\u4fdd\u5b58\u4e3a\u7f51\u9875\u6587\u4ef6\u4e86\uff0c\u4ee5\u4fbf\u539f\u8c8c\u5448\u73b0\u56fe\u6587\u6df7\u6392\uff0c\u5f53\u7136\uff0c\u4e5f\u53ef\u4ee5\u4ec5\u4ec5\u4f5c\u4e3a\u5907\u4efd\uff0c\u628a\u56fe\u7247\u6293\u4e0b\u6765\u4fdd\u5b58\u5728\u540c\u4e00\u4e2a\u76ee\u5f55\u91cc\uff0c\u5927\u5bb6\u6839\u636e\u81ea\u5df1\u9700\u6c42\u4fee\u6539\u5427\u3002 \u597d\u4e86\uff0c\u5c11\u5e9f\u8bdd\uff0c\u4e0a\u4ee3\u7801\uff1a<\/p>\n","protected":false},"author":1,"featured_media":0,"comment_status":"open","ping_status":"open","sticky":false,"template":"","format":"standard","meta":{"footnotes":""},"categories":[33],"tags":[],"class_list":["post-474","post","type-post","status-publish","format-standard","hentry","category-python","entry"],"_links":{"self":[{"href":"https:\/\/study.5dimn.com\/index.php?rest_route=\/wp\/v2\/posts\/474","targetHints":{"allow":["GET"]}}],"collection":[{"href":"https:\/\/study.5dimn.com\/index.php?rest_route=\/wp\/v2\/posts"}],"about":[{"href":"https:\/\/study.5dimn.com\/index.php?rest_route=\/wp\/v2\/types\/post"}],"author":[{"embeddable":true,"href":"https:\/\/study.5dimn.com\/index.php?rest_route=\/wp\/v2\/users\/1"}],"replies":[{"embeddable":true,"href":"https:\/\/study.5dimn.com\/index.php?rest_route=%2Fwp%2Fv2%2Fcomments&post=474"}],"version-history":[{"count":11,"href":"https:\/\/study.5dimn.com\/index.php?rest_route=\/wp\/v2\/posts\/474\/revisions"}],"predecessor-version":[{"id":703,"href":"https:\/\/study.5dimn.com\/index.php?rest_route=\/wp\/v2\/posts\/474\/revisions\/703"}],"wp:attachment":[{"href":"https:\/\/study.5dimn.com\/index.php?rest_route=%2Fwp%2Fv2%2Fmedia&parent=474"}],"wp:term":[{"taxonomy":"category","embeddable":true,"href":"https:\/\/study.5dimn.com\/index.php?rest_route=%2Fwp%2Fv2%2Fcategories&post=474"},{"taxonomy":"post_tag","embeddable":true,"href":"https:\/\/study.5dimn.com\/index.php?rest_route=%2Fwp%2Fv2%2Ftags&post=474"}],"curies":[{"name":"wp","href":"https:\/\/api.w.org\/{rel}","templated":true}]}}